import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split # Optional, if splitting manually not used
from sklearn.metrics import (mean_squared_error,mean_absolute_error,mean_absolute_percentage_error,r2_score)
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Dense, Dropout, Embedding, Flatten, Concatenate, BatchNormalization)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import os
import random
def set_seed(seed=42):
    """Seed Python, NumPy and TensorFlow RNGs for reproducible runs."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

set_seed(42)
# Load and clean data.
# NOTE(review): absolute Windows path — consider making this configurable.
cool = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/df_top_cool.csv")
# '..' appears to be the source's missing-value marker — TODO confirm against the raw CSV.
cool.replace('..', np.nan, inplace=True)
# Build a month-start date from the year/month columns.
cool['date'] = pd.to_datetime(cool[['year', 'month']].assign(day=1))
# Drop any pre-computed lag columns; lags are rebuilt per-country below.
lag_cols = [col for col in cool.columns if 'lag' in col]
cool = cool.drop(columns=lag_cols, errors='ignore')
cool = cool[~cool['country'].isin(['Russian Federation'])]
# Convert types ('..' -> NaN above means these columns may arrive as strings).
cool['year'] = pd.to_numeric(cool['year'], downcast='integer', errors='coerce')
cool['month'] = pd.to_numeric(cool['month'], downcast='integer', errors='coerce')
cool['arrivals'] = pd.to_numeric(cool['arrivals'], errors='coerce')
cool['overnights'] = pd.to_numeric(cool['overnights'], errors='coerce')
cool['unemployment_rate'] = pd.to_numeric(cool['unemployment_rate'], errors='coerce')
cool['exchange_rate'] = pd.to_numeric(cool['exchange_rate'], errors='coerce')
cool['cpi'] = pd.to_numeric(cool['cpi'], errors='coerce')
cool['eu_member'] = cool['eu_member'].astype(int)
cool['euro_adopted'] = cool['euro_adopted'].astype(int)
# Merge Google Trends data (wide file: presumably one column per country — verify header).
td = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/multiTimeline.csv")
# Normalize both date columns to month-start timestamps so the merge keys align.
cool['date'] = pd.to_datetime(cool['date']).dt.to_period('M').dt.to_timestamp()
td['date'] = pd.to_datetime(td['date']).dt.to_period('M').dt.to_timestamp()
td_long = td.melt(id_vars='date', var_name='country', value_name='google_trends')
cool = cool.merge(td_long, on=['date', 'country'], how='left')
# Sentinel -1 marks country/months with no trends data.
cool['google_trends'] = cool['google_trends'].fillna(-1)
# Sort and log-transform.
# Per-country chronological order is required for the groupby().shift() calls below.
cool = cool.sort_values(['country', 'date']).reset_index(drop=True)
cool = cool.drop_duplicates(subset=['country', 'date'], keep='first').reset_index(drop=True)
# log1p compresses the heavy right tails of the count/price variables.
cool['arrivals'] = np.log1p(cool['arrivals'])
cool['overnights'] = np.log1p(cool['overnights'])
cool['exchange_rate'] = np.log1p(cool['exchange_rate'])
cool['cpi'] = np.log1p(cool['cpi'])
# Prediction target: next month's (log1p) overnights, shifted within each country.
cool['overnights_next_month'] = cool.groupby('country')['overnights'].shift(-1)
# One-hot encode month (names, not numbers, so dummy columns read e.g. month_January).
month_names = {i: month for i, month in enumerate(['January','February','March','April','May','June','July','August','September','October','November','December'], 1)}
cool['month_name'] = cool['month'].map(month_names)
ohe_month = pd.get_dummies(cool['month_name'], prefix='month').astype(int)
cool = pd.concat([cool, ohe_month], axis=1).drop(columns=['month_name'])
# Create lags per country so each series only looks at its own history.
lags = {'arrivals': [1, 3, 6, 12], 'overnights': [1, 3, 6, 12], 'cpi': [1], 'unemployment_rate': [3], 'google_trends': [1, 3]}
for var, steps in lags.items():
    for lag in steps:
        # groupby('country') prevents lag values leaking across country boundaries.
        cool[f'{var}_lag_{lag}'] = cool.groupby('country')[var].shift(lag)
# NOTE: removed a bare `cool` expression statement (notebook display residue);
# it had no effect when run as a script.
# Step 1: Define Schengen entry years.
# Year each country joined the Schengen area; countries absent from the table
# are treated as never having joined.
schengen_entry_year = {
    'Austria': 1995,
    'Belgium': 1995,
    'Czech Republic': 2007,
    'Denmark': 2001,
    'Finland': 1996,
    'France': 1995,
    'Germany': 1995,
    'Hungary': 2007,
    'Italy': 1997,
    'Netherlands': 1995,
    'Norway': 2001,
    'Poland': 2007,
    'Portugal': 1995,
    'Slovakia': 2007,
    'Slovenia': 2007,
    'Spain': 1995,
    'Sweden': 2001,
    'Switzerland': 2008,
    'Romania': 2024,
}

def is_schengen_member(row):
    """Return 1 if row['country'] was a Schengen member in row['year'], else 0."""
    entry_year = schengen_entry_year.get(row['country'], np.inf)
    return int(row['year'] >= entry_year)

# Vectorized replacement for cool.apply(is_schengen_member, axis=1): map each
# country to its entry year. Unknown countries map to NaN, and `year >= NaN`
# is False, matching the np.inf fallback in is_schengen_member.
cool['schengen_member'] = (cool['year'] >= cool['country'].map(schengen_entry_year)).astype('int8')
# Filter and drop missing.
# NOTE: removed a bare `cool.isna().sum().sort_values(...)` expression
# (notebook inspection residue) whose result was discarded.
# Keep data from 2001 onward and drop the final row of each country,
# whose next-month target is undefined.
cool = cool[cool['date'] >= '2001-01-01']
cool = cool[cool['overnights_next_month'].notna()].copy()
# Label encode country and scale numeric features.
# NOTE(review): LabelEncoder is fit on the full (train+val+test) country set —
# fine for a fixed country list, but new countries at inference would need refitting.
cool['country_encoded'] = LabelEncoder().fit_transform(cool['country'])
month_cols = [col for col in cool.columns if col.startswith('month_')]
# Feature matrix: macro indicators, lagged targets/trends, policy dummies, month one-hots.
# Column order here must stay in sync with `feature_names` used for permutation importance.
X_numeric = cool[[
    'unemployment_rate', 'exchange_rate', 'cpi_lag_1',
    'overnights_lag_1', 'overnights_lag_3', 'overnights_lag_6', 'overnights_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member','euro_adopted'] + month_cols].values
X_country_array = cool['country_encoded'].astype('int32').values
y = cool['overnights_next_month'].values  # log1p-transformed target
# Time-based split (chronological, no shuffling): train <= 2016, val 2017-2020, test > 2020.
train_end = pd.Timestamp("2016-12-31")
val_end = pd.Timestamp("2020-12-31")
train_mask = cool['date'] <= train_end
val_mask = (cool['date'] > train_end) & (cool['date'] <= val_end)
test_mask = cool['date'] > val_end
# Scaler is fit on the training rows only, then applied everywhere (no leakage).
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_numeric[train_mask])
X_num_val = scaler.transform(X_numeric[val_mask])
X_num_test = scaler.transform(X_numeric[test_mask])
X_numeric_scaled_all = scaler.transform(X_numeric)  # full-period matrix for later plotting
# Masks are pandas Series; .to_numpy() yields plain boolean arrays for ndarray indexing.
X_cat_train = X_country_array[train_mask.to_numpy()]
X_cat_val = X_country_array[val_mask.to_numpy()]
X_cat_test = X_country_array[test_mask.to_numpy()]
y_train = y[train_mask.to_numpy()]
y_val = y[val_mask.to_numpy()]
y_test = y[test_mask.to_numpy()]
# Define model3: scaled numeric features plus a learned country embedding,
# concatenated and fed through two tanh dense blocks to a linear output.
n_countries = cool['country_encoded'].nunique()
input_numeric = Input(shape=(X_num_train.shape[1],), name="numeric_input")
input_country = Input(shape=(1,), dtype='int32', name="country_input")
country_vec = Flatten()(Embedding(input_dim=n_countries, output_dim=10)(input_country))
hidden = Concatenate()([input_numeric, country_vec])
for units in (64, 32):
    hidden = Dense(units, activation='tanh')(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.2)(hidden)
output = Dense(1)(hidden)
model3 = Model(inputs=[input_numeric, input_country], outputs=output)
model3.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
model3.summary()
# Train model3.
# NOTE(review): patience=40 against only 100 epochs means early stopping rarely
# fires; if it never triggers, restore_best_weights may not restore the
# best-val-loss weights — verify against the installed Keras version.
early_stop = EarlyStopping(monitor='val_loss', patience=40, restore_best_weights=True)
history = model3.fit(
    [X_num_train, X_cat_train], y_train,
    validation_data=([X_num_val, X_cat_val], y_val),
    epochs=100, batch_size=16, callbacks=[early_stop]
)
Model: "functional_26"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │ country_input │ (None, 1) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ embedding_26 │ (None, 1, 10) │ 230 │ country_input[0]… │ │ (Embedding) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ numeric_input │ (None, 25) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ flatten_26 │ (None, 10) │ 0 │ embedding_26[0][… │ │ (Flatten) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ concatenate_26 │ (None, 35) │ 0 │ numeric_input[0]… │ │ (Concatenate) │ │ │ flatten_26[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_78 (Dense) │ (None, 64) │ 2,304 │ concatenate_26[0… │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 64) │ 256 │ dense_78[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_52 │ (None, 64) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_79 (Dense) │ (None, 32) │ 2,080 │ dropout_52[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 32) │ 128 │ dense_79[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_53 │ (None, 32) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_80 (Dense) │ (None, 1) │ 33 │ dropout_53[0][0] │ 
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
Total params: 5,031 (19.65 KB)
Trainable params: 4,839 (18.90 KB)
Non-trainable params: 192 (768.00 B)
Epoch 1/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 28s 11ms/step - loss: 89.2403 - val_loss: 35.5990 Epoch 2/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 4s 12ms/step - loss: 24.8093 - val_loss: 2.2839 Epoch 3/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 4.6342 - val_loss: 1.7292 Epoch 4/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 3.3075 - val_loss: 1.7541 Epoch 5/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 2.9630 - val_loss: 1.6112 Epoch 6/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 2.7032 - val_loss: 1.5946 Epoch 7/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 2.4437 - val_loss: 1.5346 Epoch 8/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 2.0947 - val_loss: 1.5173 Epoch 9/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 4s 12ms/step - loss: 1.9937 - val_loss: 1.5489 Epoch 10/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 5s 10ms/step - loss: 1.8901 - val_loss: 1.5497 Epoch 11/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 9ms/step - loss: 1.7351 - val_loss: 1.6645 Epoch 12/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.6302 - val_loss: 1.5062 Epoch 13/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 1.6041 - val_loss: 1.5826 Epoch 14/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 1.4879 - val_loss: 1.6785 Epoch 15/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.4391 - val_loss: 1.5853 Epoch 16/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 1.4526 - val_loss: 1.5137 Epoch 17/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 1.3679 - val_loss: 1.5973 Epoch 18/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.4182 - val_loss: 1.5707 Epoch 19/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.3537 - val_loss: 1.6190 Epoch 20/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 5s 14ms/step - loss: 1.2815 - val_loss: 1.6774 Epoch 21/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 4s 8ms/step - loss: 1.2749 - val_loss: 1.6142 Epoch 22/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.2937 - val_loss: 1.5982 Epoch 23/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 
1.2810 - val_loss: 1.6085 Epoch 24/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.2816 - val_loss: 1.6001 Epoch 25/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.2587 - val_loss: 1.6219 Epoch 26/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.2451 - val_loss: 1.6651 Epoch 27/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1851 - val_loss: 1.6070 Epoch 28/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.2446 - val_loss: 1.6157 Epoch 29/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1416 - val_loss: 1.6052 Epoch 30/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0938 - val_loss: 1.6009 Epoch 31/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.1726 - val_loss: 1.6858 Epoch 32/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1481 - val_loss: 1.6473 Epoch 33/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 1.1703 - val_loss: 1.6512 Epoch 34/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.1190 - val_loss: 1.5767 Epoch 35/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 1.1113 - val_loss: 1.5925 Epoch 36/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1510 - val_loss: 1.5767 Epoch 37/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0962 - val_loss: 1.6330 Epoch 38/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1153 - val_loss: 1.5236 Epoch 39/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1686 - val_loss: 1.5506 Epoch 40/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1147 - val_loss: 1.5003 Epoch 41/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1116 - val_loss: 1.6148 Epoch 42/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.1468 - val_loss: 1.5725 Epoch 43/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 1.0854 - val_loss: 1.5362 Epoch 44/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0844 - val_loss: 1.6029 Epoch 45/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 1.0681 - val_loss: 1.5509 Epoch 46/100 276/276 
━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 1.0551 - val_loss: 1.5586 Epoch 47/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0376 - val_loss: 1.6248 Epoch 48/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0158 - val_loss: 1.5275 Epoch 49/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 1.0508 - val_loss: 1.5094 Epoch 50/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 1.0701 - val_loss: 1.5162 Epoch 51/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0496 - val_loss: 1.4887 Epoch 52/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 1.0185 - val_loss: 1.5744 Epoch 53/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0444 - val_loss: 1.4563 Epoch 54/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0266 - val_loss: 1.5447 Epoch 55/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9903 - val_loss: 1.5421 Epoch 56/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.0146 - val_loss: 1.6193 Epoch 57/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 1.0453 - val_loss: 1.5150 Epoch 58/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0113 - val_loss: 1.4521 Epoch 59/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 0.9473 - val_loss: 1.5612 Epoch 60/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.9989 - val_loss: 1.6867 Epoch 61/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9996 - val_loss: 1.4673 Epoch 62/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 0.9882 - val_loss: 1.5372 Epoch 63/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.9569 - val_loss: 1.4765 Epoch 64/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0222 - val_loss: 1.5254 Epoch 65/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9942 - val_loss: 1.5092 Epoch 66/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9570 - val_loss: 1.4974 Epoch 67/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9730 - val_loss: 1.5087 Epoch 68/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9471 - val_loss: 
1.4377 Epoch 69/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9546 - val_loss: 1.5303 Epoch 70/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9070 - val_loss: 1.4746 Epoch 71/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.9737 - val_loss: 1.5554 Epoch 72/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9728 - val_loss: 1.5023 Epoch 73/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0142 - val_loss: 1.5233 Epoch 74/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 0.9267 - val_loss: 1.4812 Epoch 75/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.9398 - val_loss: 1.4838 Epoch 76/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9100 - val_loss: 1.4992 Epoch 77/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8853 - val_loss: 1.4566 Epoch 78/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.9416 - val_loss: 1.4769 Epoch 79/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 10ms/step - loss: 0.8910 - val_loss: 1.5021 Epoch 80/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.9447 - val_loss: 1.4970 Epoch 81/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.9396 - val_loss: 1.5415 Epoch 82/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9037 - val_loss: 1.4931 Epoch 83/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8417 - val_loss: 1.4645 Epoch 84/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.9084 - val_loss: 1.4984 Epoch 85/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8796 - val_loss: 1.4698 Epoch 86/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8927 - val_loss: 1.4486 Epoch 87/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 0.9113 - val_loss: 1.5172 Epoch 88/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8731 - val_loss: 1.5109 Epoch 89/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8907 - val_loss: 1.4753 Epoch 90/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.8842 - val_loss: 1.4480 Epoch 91/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - 
loss: 0.8521 - val_loss: 1.4783 Epoch 92/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9067 - val_loss: 1.4194 Epoch 93/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8771 - val_loss: 1.4823 Epoch 94/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 0.8827 - val_loss: 1.4256 Epoch 95/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8614 - val_loss: 1.4351 Epoch 96/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 0.8507 - val_loss: 1.5070 Epoch 97/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.8448 - val_loss: 1.4578 Epoch 98/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.8651 - val_loss: 1.5128 Epoch 99/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8209 - val_loss: 1.4792 Epoch 100/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8280 - val_loss: 1.4589
# Baseline predictions on the untouched test set.
baseline_preds = model3.predict([X_num_test, X_cat_test]).flatten()
baseline_mse = mean_squared_error(y_test, baseline_preds)
# Must line up 1:1 with the columns of X_numeric built above.
feature_names = [
    'unemployment_rate', 'exchange_rate', 'cpi_lag_1',
    'overnights_lag_1', 'overnights_lag_3', 'overnights_lag_6', 'overnights_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted']+month_cols
importances = []
# Permutation importance: shuffle one scaled column at a time and measure the MSE increase.
for i in range(X_num_test.shape[1]):
    X_permuted = X_num_test.copy()
    X_permuted[:, i] = np.random.permutation(X_permuted[:, i])
    permuted_preds = model3.predict([X_permuted, X_cat_test]).flatten()
    permuted_mse = mean_squared_error(y_test, permuted_preds)
    importances.append(max(0, permuted_mse - baseline_mse))  # clamp negative importances to 0
# Handle grouped month dummies so there isn't one bar for each month.
month_indices = [i for i, f in enumerate(feature_names) if f.startswith('month_')]
if month_indices:
    # Permute whole rows of the month block together, keeping each row a valid one-hot.
    X_month_permuted = X_num_test.copy()
    row_perm = np.random.permutation(len(X_month_permuted))
    X_month_permuted[:, month_indices] = X_month_permuted[row_perm][:, month_indices]
    month_preds = model3.predict([X_month_permuted, X_cat_test]).flatten()
    month_importance = max(0, mean_squared_error(y_test, month_preds) - baseline_mse)
else:
    month_importance = 0  # fallback if no month cols present
# Drop the per-dummy month importances computed above and append the grouped score.
filtered_names = [f for i, f in enumerate(feature_names) if i not in month_indices]
filtered_importances = [imp for i, imp in enumerate(importances) if i not in month_indices]
feature_names_final = filtered_names + ['month_group']
importances_final = filtered_importances + [month_importance]
# Sort features by importance, largest first.
sorted_pairs = sorted(zip(importances_final, feature_names_final), reverse=True)
importances_sorted, feature_names_sorted = zip(*sorted_pairs)
# Horizontal bar chart of permutation importances, largest bar at the top.
plt.figure(figsize=(12, 6))
bars = plt.barh(feature_names_sorted, importances_sorted)
plt.xlabel("Increase in MSE when shuffled")
plt.title("Model 3: Overnights (2024)")
plt.gca().invert_yaxis()
# Annotate each bar with its numeric importance.
for rect in bars:
    w = rect.get_width()
    plt.text(w + 0.001, rect.get_y() + rect.get_height() / 2,
             f"{w:.4f}", va='center')
plt.tight_layout()
plt.show()
34/34 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step
## Model evaluation
# Predict on train and test sets, then report standard regression metrics.
train_preds = model3.predict([X_num_train, X_cat_train]).flatten()
test_preds = model3.predict([X_num_test, X_cat_test]).flatten()

def _report(y_true, y_hat):
    """Return (mse, rmse, mae, mape, r2) for one split."""
    mse = mean_squared_error(y_true, y_hat)
    return (mse,
            np.sqrt(mse),
            mean_absolute_error(y_true, y_hat),
            mean_absolute_percentage_error(y_true, y_hat),
            r2_score(y_true, y_hat))

train_mse, train_rmse3, train_mae3, train_mape3, train_r2_3 = _report(y_train, train_preds)
test_mse, test_rmse3, test_mae3, test_mape3, test_r2_3 = _report(y_test, test_preds)
# Print all metrics
print(f"Train MSE: {train_mse:.4f}, RMSE: {train_rmse3:.4f}, MAE: {train_mae3:.4f}, MAPE: {train_mape3:.4f}, R²: {train_r2_3:.4f}")
print(f"Test MSE: {test_mse:.4f}, RMSE: {test_rmse3:.4f}, MAE: {test_mae3:.4f}, MAPE: {test_mape3:.4f}, R²: {test_r2_3:.4f}")
# Plot training history: train vs. validation loss per epoch.
plt.figure(figsize=(10, 4))
for key, lbl in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[key], label=lbl)
plt.title('Training History')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()
138/138 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step Train MSE: 0.0999, RMSE: 0.3161, MAE: 0.2388, MAPE: 0.0260, R²: 0.9798 Test MSE: 0.3100, RMSE: 0.5568, MAE: 0.4334, MAPE: 0.0432, R²: 0.9311
# Predict over the full (train+val+test) period and undo the log1p transform.
y_pred_all = model3.predict([X_numeric_scaled_all, X_country_array]).flatten()
y_actual_all = np.expm1(y)
y_pred_all = np.expm1(y_pred_all)
df_plot = cool.copy()
df_plot['actual'] = y_actual_all
df_plot['predicted'] = y_pred_all
# Aggregate across countries to total overnights per month.
monthly_totals_all = df_plot.groupby('date')[['actual', 'predicted']].sum()
plt.figure(figsize=(16, 6))
plt.plot(monthly_totals_all.index, monthly_totals_all['actual'], label='Total Actual', linewidth=2)
plt.plot(monthly_totals_all.index, monthly_totals_all['predicted'], label='Total Predicted', linestyle='--', alpha=0.8)
plt.title("Total Monthly Overnights (2024)")
plt.xlabel("Date")
plt.ylabel("Total Overnights")
plt.ticklabel_format(style='plain', axis='y')
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# (dropped a redundant xticks(rotation=90) that was immediately overridden by rotation=45)
plt.xticks(rotation=45)
plt.xlim(monthly_totals_all.index.min(), monthly_totals_all.index.max())
# Mark the chronological split boundaries.
plt.axvline(x=train_end, color='gray', linestyle='--', linewidth=1.5, label='Train/Val Split')
plt.axvline(x=val_end, color='black', linestyle='--', linewidth=1.5, label='Val/Test Split')
plt.text(train_end, plt.ylim()[1]*0.95, 'Training set', rotation=90, ha='right', va='top', color='gray')
plt.text(val_end, plt.ylim()[1]*0.95, 'Testing set', rotation=90, ha='right', va='top', color='black')
# BUG FIX: legend() must be called after the labelled axvlines; previously it ran
# before them, so the split lines never appeared in the legend.
plt.legend()
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
plt.tight_layout()
plt.show()
207/207 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step
# Country-level plots: one actual-vs-predicted panel per country, 3 per row.
countries = df_plot['country'].unique()
# Ceiling division (minimum one row) avoids the spurious blank row the old
# `len // 3 + 1` formula produced when the count was divisible by 3.
n_rows = max(1, (len(countries) + 2) // 3)
fig, axes = plt.subplots(n_rows, 3, figsize=(18, 3 * n_rows), sharex=False, sharey=False)
axes = axes.flatten()
for i, country in enumerate(countries):
    country_df = df_plot[df_plot['country'] == country].groupby('date')[['actual', 'predicted']].sum()
    ax = axes[i]
    ax.plot(country_df.index, country_df['actual'], label='Actual')
    ax.plot(country_df.index, country_df['predicted'], label='Predicted', linestyle='--')
    ax.set_title(country)
    # Show x-axis ticks every 3 years.
    ax.xaxis.set_major_locator(mdates.YearLocator(base=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    # Add legend only once.
    if i == 0:
        ax.legend()
# Remove unused subplots. BUG FIX: range starts at len(countries) instead of the
# loop variable `i + 1`, which raised NameError when `countries` was empty.
for j in range(len(countries), len(axes)):
    fig.delaxes(axes[j])
fig.tight_layout()
fig.suptitle('Actual vs Predicted Monthly Overnights per Country (2024)', fontsize=16, y=1.02)
plt.show()
# Per-country metrics on the test period (dates after val_end), original scale.
# df_plot: DataFrame with columns ['country', 'date', 'actual', 'predicted']
countries = df_plot['country'].unique()
results = []
for country in countries:
    # Subset data for this country (test set only).
    country_df = df_plot[(df_plot['country'] == country) & (df_plot['date'] > val_end)]
    actual = country_df['actual'].values
    pred = country_df['predicted'].values
    if len(actual) == 0:
        continue  # Skip countries with no data in test set
    mse = mean_squared_error(actual, pred)
    # BUG FIX: RMSE is now the square root of the *unrounded* MSE; previously the
    # MSE was rounded to 6 decimals before the sqrt, which distorts RMSE for
    # small MSE values.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, pred)
    mape = mean_absolute_percentage_error(actual, pred)
    r2 = r2_score(actual, pred)
    results.append({
        'country': country,
        'MSE': round(mse, 6),  # keep the rounded MSE in the displayed table, as before
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'R2': r2
    })
# Convert to DataFrame for easy export or display.
country_metrics = pd.DataFrame(results)
print(country_metrics)
country MSE RMSE MAE \
0 Austria 1.028618e+11 3.207207e+05 2.174073e+05
1 Belgium 2.086010e+09 4.567286e+04 2.272012e+04
2 Bosnia and Herzegovina 8.145445e+09 9.025212e+04 4.634687e+04
3 Canada 1.774758e+08 1.332201e+04 8.498550e+03
4 Czech Republic 6.500328e+10 2.549574e+05 1.214085e+05
5 Denmark 3.653635e+09 6.044530e+04 2.052040e+04
6 Finland 2.066050e+08 1.437376e+04 8.788207e+03
7 France 3.886561e+09 6.234229e+04 3.070490e+04
8 Germany 4.779323e+12 2.186166e+06 1.200339e+06
9 Hungary 1.688156e+10 1.299290e+05 6.355589e+04
10 Ireland 4.933720e+08 2.221198e+04 1.300958e+04
11 Italy 2.409223e+10 1.552167e+05 7.373700e+04
12 Netherlands 2.083571e+10 1.443458e+05 6.193622e+04
13 Norway 1.445879e+09 3.802472e+04 1.362845e+04
14 Poland 1.666509e+11 4.082290e+05 2.289431e+05
15 Romania 5.463797e+08 2.337477e+04 1.230082e+04
16 Slovakia 3.082129e+09 5.551692e+04 2.890555e+04
17 Slovenia 1.659923e+11 4.074216e+05 2.047528e+05
18 Spain 1.214496e+08 1.102042e+04 6.626548e+03
19 Sweden 1.519215e+09 3.897710e+04 1.818992e+04
20 Switzerland 4.152904e+09 6.444303e+04 3.543848e+04
21 USA 7.544876e+09 8.686125e+04 6.149310e+04
22 United Kingdom 4.076366e+10 2.019001e+05 1.295231e+05
MAPE R2
0 0.576117 0.776627
1 0.632651 0.806398
2 0.370813 0.745085
3 0.440671 0.789588
4 0.283152 0.848905
5 0.366774 0.822101
6 0.643731 0.716840
7 0.298985 0.898379
8 0.478053 0.160870
9 0.358179 0.900508
10 0.541236 0.686613
11 0.309986 0.894494
12 0.296124 0.856792
13 0.561548 0.729123
14 0.392675 0.731027
15 0.339013 0.847686
16 0.244617 0.982166
17 0.475613 0.761378
18 0.302552 0.942903
19 0.326946 0.878634
20 0.300017 0.717833
21 0.436030 0.403028
22 0.678253 0.495580
# ---- Second run of the preprocessing pipeline ----
# NOTE(review): this repeats the earlier load/clean/transform block verbatim;
# consider extracting a load_and_prepare() helper to avoid drift between copies.
# Load and clean data
cool = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/df_top_cool.csv")
# '..' appears to be the source's missing-value marker — TODO confirm.
cool.replace('..', np.nan, inplace=True)
cool['date'] = pd.to_datetime(cool[['year', 'month']].assign(day=1))
# Drop any pre-computed lag columns; lags are rebuilt per-country below.
lag_cols = [col for col in cool.columns if 'lag' in col]
cool = cool.drop(columns=lag_cols, errors='ignore')
cool = cool[~cool['country'].isin(['Russian Federation'])]
# Convert types
cool['year'] = pd.to_numeric(cool['year'], downcast='integer', errors='coerce')
cool['month'] = pd.to_numeric(cool['month'], downcast='integer', errors='coerce')
cool['arrivals'] = pd.to_numeric(cool['arrivals'], errors='coerce')
cool['overnights'] = pd.to_numeric(cool['overnights'], errors='coerce')
cool['unemployment_rate'] = pd.to_numeric(cool['unemployment_rate'], errors='coerce')
cool['exchange_rate'] = pd.to_numeric(cool['exchange_rate'], errors='coerce')
cool['cpi'] = pd.to_numeric(cool['cpi'], errors='coerce')
cool['eu_member'] = cool['eu_member'].astype(int)
cool['euro_adopted'] = cool['euro_adopted'].astype(int)
# Merge Google Trends data; normalize both date columns to month-start so keys align.
td = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/multiTimeline.csv")
cool['date'] = pd.to_datetime(cool['date']).dt.to_period('M').dt.to_timestamp()
td['date'] = pd.to_datetime(td['date']).dt.to_period('M').dt.to_timestamp()
td_long = td.melt(id_vars='date', var_name='country', value_name='google_trends')
cool = cool.merge(td_long, on=['date', 'country'], how='left')
cool['google_trends'] = cool['google_trends'].fillna(-1)  # -1 = no trends data
# Sort and log-transform (per-country order is required for groupby().shift()).
cool = cool.sort_values(['country', 'date']).reset_index(drop=True)
cool = cool.drop_duplicates(subset=['country', 'date'], keep='first').reset_index(drop=True)
cool['arrivals'] = np.log1p(cool['arrivals'])
cool['overnights'] = np.log1p(cool['overnights'])
cool['exchange_rate'] = np.log1p(cool['exchange_rate'])
cool['cpi'] = np.log1p(cool['cpi'])
# Prediction target: next month's (log1p) overnights, shifted within each country.
cool['overnights_next_month'] = cool.groupby('country')['overnights'].shift(-1)
# One-hot encode month
month_names = {i: month for i, month in enumerate(['January','February','March','April','May','June','July','August','September','October','November','December'], 1)}
cool['month_name'] = cool['month'].map(month_names)
ohe_month = pd.get_dummies(cool['month_name'], prefix='month').astype(int)
cool = pd.concat([cool, ohe_month], axis=1).drop(columns=['month_name'])
# Create lags per country so each series only looks at its own history.
lags = {'arrivals': [1, 3, 6, 12], 'overnights': [1, 3, 6, 12], 'cpi': [1], 'unemployment_rate': [3], 'google_trends': [1, 3]}
for var, steps in lags.items():
    for lag in steps:
        # groupby('country') prevents lag values leaking across country boundaries.
        cool[f'{var}_lag_{lag}'] = cool.groupby('country')[var].shift(lag)
# NOTE: removed a bare `cool` expression statement (notebook display residue);
# it had no effect when run as a script.
# Step 1: Define Schengen entry years.
# Year each country joined the Schengen area; countries absent from the table
# are treated as never having joined.
schengen_entry_year = {
    'Austria': 1995,
    'Belgium': 1995,
    'Czech Republic': 2007,
    'Denmark': 2001,
    'Finland': 1996,
    'France': 1995,
    'Germany': 1995,
    'Hungary': 2007,
    'Italy': 1997,
    'Netherlands': 1995,
    'Norway': 2001,
    'Poland': 2007,
    'Portugal': 1995,
    'Slovakia': 2007,
    'Slovenia': 2007,
    'Spain': 1995,
    'Sweden': 2001,
    'Switzerland': 2008,
    'Romania': 2024,
}

def is_schengen_member(row):
    """Return 1 if row['country'] was a Schengen member in row['year'], else 0."""
    entry_year = schengen_entry_year.get(row['country'], np.inf)
    return int(row['year'] >= entry_year)

# Vectorized replacement for cool.apply(is_schengen_member, axis=1): map each
# country to its entry year. Unknown countries map to NaN, and `year >= NaN`
# is False, matching the np.inf fallback in is_schengen_member.
cool['schengen_member'] = (cool['year'] >= cool['country'].map(schengen_entry_year)).astype('int8')
# Inspect missingness (notebook display; no effect when run as a script).
cool.isna().sum().sort_values(ascending=False)
# Restrict to the 2001-2019 window and keep only rows whose target exists.
in_window = (cool['date'] >= '2001-01-01') & (cool['date'] <= '2019-12-01')
cool = cool[in_window]
cool = cool[cool['overnights_next_month'].notna()].copy()
# Integer-encode country for the embedding input.
cool['country_encoded'] = LabelEncoder().fit_transform(cool['country'])
month_cols = [c for c in cool.columns if c.startswith('month_')]
feature_cols = ['unemployment_rate', 'exchange_rate', 'cpi_lag_1',
                'overnights_lag_1', 'overnights_lag_3', 'overnights_lag_6',
                'overnights_lag_12', 'google_trends_lag_1', 'google_trends_lag_3',
                'schengen_member', 'euro_adopted'] + month_cols
X_numeric = cool[feature_cols].values
X_country_array = cool['country_encoded'].astype('int32').values
y = cool['overnights_next_month'].values
# Chronological split: train <= 2014, val 2015-2017, test 2018-2019.
train_end = pd.Timestamp("2014-12-31")
val_end = pd.Timestamp("2017-12-31")
train_mask = cool['date'] <= train_end
val_mask = (cool['date'] > train_end) & (cool['date'] <= val_end)
test_mask = cool['date'] > val_end
# Fit the scaler on training rows only to avoid leaking future statistics.
scaler = StandardScaler().fit(X_numeric[train_mask])
X_num_train = scaler.transform(X_numeric[train_mask])
X_num_val = scaler.transform(X_numeric[val_mask])
X_num_test = scaler.transform(X_numeric[test_mask])
X_numeric_scaled_all = scaler.transform(X_numeric)
# Boolean masks as plain numpy arrays index the numpy feature/target arrays.
train_idx, val_idx, test_idx = (m.to_numpy() for m in (train_mask, val_mask, test_mask))
X_cat_train, X_cat_val, X_cat_test = (X_country_array[m] for m in (train_idx, val_idx, test_idx))
y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]
# model4: scaled numeric features + a learned 10-dim country embedding,
# concatenated and fed through a small tanh MLP regressor.
input_numeric = Input(shape=(X_num_train.shape[1],), name="numeric_input")
input_country = Input(shape=(1,), dtype='int32', name="country_input")
n_countries = cool['country_encoded'].nunique()
country_vec = Flatten()(Embedding(input_dim=n_countries, output_dim=10)(input_country))
hidden = Concatenate()([input_numeric, country_vec])
# Two Dense -> BatchNorm -> Dropout stages (64 then 32 units).
for units in (64, 32):
    hidden = Dense(units, activation='tanh')(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.2)(hidden)
output = Dense(1)(hidden)
model4 = Model(inputs=[input_numeric, input_country], outputs=output)
model4.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
model4.summary()
# Train model4; early stopping watches validation loss and restores the
# best-epoch weights when training ends.
stopper = EarlyStopping(monitor='val_loss', patience=40, restore_best_weights=True)
history = model4.fit(
    x=[X_num_train, X_cat_train],
    y=y_train,
    validation_data=([X_num_val, X_cat_val], y_val),
    epochs=100,
    batch_size=16,
    callbacks=[stopper],
)
Model: "functional_36"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │ country_input │ (None, 1) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ embedding_36 │ (None, 1, 10) │ 230 │ country_input[0]… │ │ (Embedding) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ numeric_input │ (None, 25) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ flatten_36 │ (None, 10) │ 0 │ embedding_36[0][… │ │ (Flatten) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ concatenate_36 │ (None, 35) │ 0 │ numeric_input[0]… │ │ (Concatenate) │ │ │ flatten_36[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_108 (Dense) │ (None, 64) │ 2,304 │ concatenate_36[0… │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 64) │ 256 │ dense_108[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_72 │ (None, 64) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_109 (Dense) │ (None, 32) │ 2,080 │ dropout_72[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 32) │ 128 │ dense_109[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_73 │ (None, 32) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_110 (Dense) │ (None, 1) │ 33 │ dropout_73[0][0] │ 
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
Total params: 5,031 (19.65 KB)
Trainable params: 4,839 (18.90 KB)
Non-trainable params: 192 (768.00 B)
Epoch 1/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 12s 8ms/step - loss: 90.7974 - val_loss: 46.7961 Epoch 2/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 33.5081 - val_loss: 3.1799 Epoch 3/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 5.2712 - val_loss: 0.7961 Epoch 4/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 3.8846 - val_loss: 0.5789 Epoch 5/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 3.2203 - val_loss: 0.5077 Epoch 6/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 2.9854 - val_loss: 0.5004 Epoch 7/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 2.8682 - val_loss: 0.4250 Epoch 8/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 2.5530 - val_loss: 0.4092 Epoch 9/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 2.3035 - val_loss: 0.4177 Epoch 10/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 2.0789 - val_loss: 0.3989 Epoch 11/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 9ms/step - loss: 1.9518 - val_loss: 0.3952 Epoch 12/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.9176 - val_loss: 0.3784 Epoch 13/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.7907 - val_loss: 0.4061 Epoch 14/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.6883 - val_loss: 0.3758 Epoch 15/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.6661 - val_loss: 0.3739 Epoch 16/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.6102 - val_loss: 0.3784 Epoch 17/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.5431 - val_loss: 0.3768 Epoch 18/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.5284 - val_loss: 0.4029 Epoch 19/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 1.5092 - val_loss: 0.3586 Epoch 20/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.3832 - val_loss: 0.3747 Epoch 21/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.4667 - val_loss: 0.3209 Epoch 22/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.4723 - val_loss: 0.3727 Epoch 23/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.3939 
- val_loss: 0.4069 Epoch 24/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.3451 - val_loss: 0.4176 Epoch 25/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 15ms/step - loss: 1.3533 - val_loss: 0.3674 Epoch 26/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.3464 - val_loss: 0.3010 Epoch 27/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.3440 - val_loss: 0.3922 Epoch 28/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.2810 - val_loss: 0.3386 Epoch 29/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.2191 - val_loss: 0.3492 Epoch 30/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.2700 - val_loss: 0.3981 Epoch 31/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.3477 - val_loss: 0.3905 Epoch 32/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.3152 - val_loss: 0.3948 Epoch 33/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.2744 - val_loss: 0.3904 Epoch 34/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.2857 - val_loss: 0.3538 Epoch 35/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 6ms/step - loss: 1.2550 - val_loss: 0.3604 Epoch 36/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.2832 - val_loss: 0.3851 Epoch 37/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 1.2083 - val_loss: 0.3713 Epoch 38/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.2435 - val_loss: 0.3195 Epoch 39/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.2570 - val_loss: 0.3591 Epoch 40/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.2140 - val_loss: 0.3145 Epoch 41/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 6ms/step - loss: 1.3116 - val_loss: 0.3504 Epoch 42/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 1.2215 - val_loss: 0.3300 Epoch 43/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.2311 - val_loss: 0.2900 Epoch 44/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 1.2804 - val_loss: 0.3123 Epoch 45/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.2416 - val_loss: 0.2690 Epoch 46/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 
3s 8ms/step - loss: 1.2357 - val_loss: 0.3585 Epoch 47/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.2360 - val_loss: 0.3480 Epoch 48/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 10ms/step - loss: 1.1846 - val_loss: 0.2819 Epoch 49/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.2401 - val_loss: 0.3116 Epoch 50/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 6ms/step - loss: 1.2199 - val_loss: 0.3275 Epoch 51/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.1878 - val_loss: 0.3304 Epoch 52/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.2340 - val_loss: 0.2855 Epoch 53/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 1.2356 - val_loss: 0.3079 Epoch 54/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1982 - val_loss: 0.3781 Epoch 55/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1904 - val_loss: 0.3046 Epoch 56/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1438 - val_loss: 0.3088 Epoch 57/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1583 - val_loss: 0.3761 Epoch 58/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1836 - val_loss: 0.3130 Epoch 59/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1245 - val_loss: 0.3135 Epoch 60/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1463 - val_loss: 0.3537 Epoch 61/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1222 - val_loss: 0.3582 Epoch 62/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1204 - val_loss: 0.3066 Epoch 63/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0663 - val_loss: 0.2981 Epoch 64/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1196 - val_loss: 0.3340 Epoch 65/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0988 - val_loss: 0.2898 Epoch 66/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1041 - val_loss: 0.3006 Epoch 67/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0544 - val_loss: 0.2970 Epoch 68/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1703 - val_loss: 0.2498 Epoch 69/100 
242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0619 - val_loss: 0.2769 Epoch 70/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1206 - val_loss: 0.2586 Epoch 71/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1017 - val_loss: 0.2959 Epoch 72/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1011 - val_loss: 0.2986 Epoch 73/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1399 - val_loss: 0.2408 Epoch 74/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0888 - val_loss: 0.3087 Epoch 75/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1052 - val_loss: 0.2366 Epoch 76/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0449 - val_loss: 0.3115 Epoch 77/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1046 - val_loss: 0.2994 Epoch 78/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0914 - val_loss: 0.3066 Epoch 79/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0305 - val_loss: 0.2944 Epoch 80/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0550 - val_loss: 0.2969 Epoch 81/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0954 - val_loss: 0.3187 Epoch 82/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1236 - val_loss: 0.2890 Epoch 83/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0461 - val_loss: 0.2956 Epoch 84/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0617 - val_loss: 0.2678 Epoch 85/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0369 - val_loss: 0.2509 Epoch 86/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0528 - val_loss: 0.3236 Epoch 87/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0568 - val_loss: 0.2442 Epoch 88/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0549 - val_loss: 0.2777 Epoch 89/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0523 - val_loss: 0.2935 Epoch 90/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0278 - val_loss: 0.2798 Epoch 91/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.9610 - 
val_loss: 0.3066 Epoch 92/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0297 - val_loss: 0.2243 Epoch 93/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0141 - val_loss: 0.2905 Epoch 94/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0028 - val_loss: 0.3072 Epoch 95/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0506 - val_loss: 0.2882 Epoch 96/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0073 - val_loss: 0.2186 Epoch 97/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0086 - val_loss: 0.3141 Epoch 98/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.9936 - val_loss: 0.2790 Epoch 99/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.9645 - val_loss: 0.3335 Epoch 100/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0158 - val_loss: 0.2461
# Permutation importance on the test set: shuffle one scaled feature column
# at a time and record the increase in MSE over unshuffled predictions.
baseline_preds = model4.predict([X_num_test, X_cat_test]).flatten()
baseline_mse = mean_squared_error(y_test, baseline_preds)
feature_names = [
    'unemployment_rate', 'exchange_rate', 'cpi_lag_1',
    'overnights_lag_1', 'overnights_lag_3', 'overnights_lag_6', 'overnights_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted'] + month_cols
importances = []
for col_idx in range(X_num_test.shape[1]):
    shuffled = X_num_test.copy()
    shuffled[:, col_idx] = np.random.permutation(shuffled[:, col_idx])
    preds = model4.predict([shuffled, X_cat_test]).flatten()
    # Negative deltas (shuffling "helped") are clipped to zero.
    importances.append(max(0, mean_squared_error(y_test, preds) - baseline_mse))
# Handle the month dummies as one group so the plot shows a single calendar
# bar instead of twelve: permute all month columns with the SAME row order.
month_indices = [i for i, f in enumerate(feature_names) if f.startswith('month_')]
# Fix: use a set for membership tests below; `i not in <list>` is O(n) per check.
month_index_set = set(month_indices)
if month_indices:
    X_month_permuted = X_num_test.copy()
    row_perm = np.random.permutation(len(X_month_permuted))
    X_month_permuted[:, month_indices] = X_month_permuted[row_perm][:, month_indices]
    month_preds = model4.predict([X_month_permuted, X_cat_test]).flatten()
    month_importance = max(0, mean_squared_error(y_test, month_preds) - baseline_mse)
else:
    month_importance = 0  # fallback if no month cols present
# Drop the individual month bars and append one grouped entry.
filtered_names = [f for i, f in enumerate(feature_names) if i not in month_index_set]
filtered_importances = [imp for i, imp in enumerate(importances) if i not in month_index_set]
feature_names_final = filtered_names + ['month_group']
importances_final = filtered_importances + [month_importance]
# Horizontal bar chart of importances, largest at the top, value labels on bars.
sorted_pairs = sorted(zip(importances_final, feature_names_final), reverse=True)
importances_sorted, feature_names_sorted = zip(*sorted_pairs)
plt.figure(figsize=(12, 6))
# NOTE(review): title says "Model 3" but this plots model4 results — confirm intended.
bars = plt.barh(feature_names_sorted, importances_sorted)
plt.xlabel("Increase in MSE when shuffled")
plt.title("Model 3: Overnights (2019)")
plt.gca().invert_yaxis()
# Annotate each bar with its importance value just past the bar end.
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height() / 2,
             f"{width:.4f}", va='center')
plt.tight_layout()
plt.show()
18/18 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
## Model evaluation
# Predictions on train and test splits, flattened from (n, 1) to (n,).
train_preds = model4.predict([X_num_train, X_cat_train]).flatten()
test_preds = model4.predict([X_num_test, X_cat_test]).flatten()
# All metrics are computed on the log1p scale used for the target.
train_mse = mean_squared_error(y_train, train_preds)
test_mse = mean_squared_error(y_test, test_preds)
train_rmse4, test_rmse4 = np.sqrt(train_mse), np.sqrt(test_mse)
train_mae4 = mean_absolute_error(y_train, train_preds)
test_mae4 = mean_absolute_error(y_test, test_preds)
train_mape4 = mean_absolute_percentage_error(y_train, train_preds)
test_mape4 = mean_absolute_percentage_error(y_test, test_preds)
train_r2_4 = r2_score(y_train, train_preds)
test_r2_4 = r2_score(y_test, test_preds)
# Summary lines for both splits.
print(f"Train MSE: {train_mse:.4f}, RMSE: {train_rmse4:.4f}, MAE: {train_mae4:.4f}, MAPE: {train_mape4:.4f}, R²: {train_r2_4:.4f}")
print(f"Test MSE: {test_mse:.4f}, RMSE: {test_rmse4:.4f}, MAE: {test_mae4:.4f}, MAPE: {test_mape4:.4f}, R²: {test_r2_4:.4f}")
# Loss curves across epochs for the training run above.
plt.figure(figsize=(10, 4))
for series_key, series_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[series_key], label=series_label)
plt.title('Training History')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()
121/121 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step Train MSE: 0.1149, RMSE: 0.3390, MAE: 0.2549, MAPE: 0.0277, R²: 0.9768 Test MSE: 0.3801, RMSE: 0.6165, MAE: 0.4971, MAPE: 0.0493, R²: 0.9077
# Global trend plot: total actual vs predicted overnights per month, with
# predictions over the ENTIRE panel (train+val+test) and values
# back-transformed from the log1p scale with expm1.
y_pred_all = model4.predict([X_numeric_scaled_all, X_country_array]).flatten()
y_actual_all = np.expm1(y)
y_pred_all = np.expm1(y_pred_all)
df_plot = cool.copy()
df_plot['actual'] = y_actual_all
df_plot['predicted'] = y_pred_all
monthly_totals_all = df_plot.groupby('date')[['actual', 'predicted']].sum()
plt.figure(figsize=(16, 6))
plt.plot(monthly_totals_all.index, monthly_totals_all['actual'], label='Total Actual', linewidth=2)
plt.plot(monthly_totals_all.index, monthly_totals_all['predicted'], label='Total Predicted', linestyle='--', alpha=0.8)
plt.title("Total Monthly Overnights (2019)")
plt.xlabel("Date")
plt.ylabel("Total Overnights")
plt.ticklabel_format(style='plain', axis='y')
plt.legend()
plt.tight_layout()
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# Fix: an earlier duplicate plt.xticks(rotation=90) was dead code — this later
# call always overrode it, so only one rotation setting is kept.
plt.xticks(rotation=45)
plt.xlim(monthly_totals_all.index.min(), monthly_totals_all.index.max())
# NOTE(review): these axvline labels never reach the legend because plt.legend()
# is called earlier; the text annotations below serve that role — confirm intended.
plt.axvline(x=train_end, color='gray', linestyle='--', linewidth=1.5, label='Train/Val Split')
plt.axvline(x=val_end, color='black', linestyle='--', linewidth=1.5, label='Val/Test Split')
plt.text(train_end, plt.ylim()[1]*0.95, 'Training set', rotation=90, ha='right', va='top', color='gray')
plt.text(val_end, plt.ylim()[1]*0.95, 'Testing set', rotation=90, ha='right', va='top', color='black')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
plt.show()
164/164 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step
# Per-country actual-vs-predicted panels laid out three columns wide.
countries = df_plot['country'].unique()
n_rows = len(countries) // 3 + 1
fig, axes = plt.subplots(n_rows, 3, figsize=(18, 3 * n_rows), sharex=False, sharey=False)
axes = axes.flatten()
for i, country in enumerate(countries):
    series = df_plot[df_plot['country'] == country].groupby('date')[['actual', 'predicted']].sum()
    ax = axes[i]
    ax.plot(series.index, series['actual'], label='Actual')
    ax.plot(series.index, series['predicted'], label='Predicted', linestyle='--')
    ax.set_title(country)
    # Year ticks every three years keep the small panels readable.
    ax.xaxis.set_major_locator(mdates.YearLocator(base=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    if i == 0:
        ax.legend()  # single shared legend on the first panel
# Remove any axes left over beyond the last plotted country.
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
fig.tight_layout()
fig.suptitle('Actual vs Predicted Monthly Overnights per Country (2019)', fontsize=16, y=1.02)
plt.show()
# Per-country test-set metrics.
# df_plot: DataFrame with columns ['country', 'date', 'actual', 'predicted'].
countries = df_plot['country'].unique()
results = []
for country in countries:
    # Test-set rows only: dates strictly after the validation cutoff.
    country_df = df_plot[(df_plot['country'] == country) & (df_plot['date'] > val_end)]
    actual = country_df['actual'].values
    pred = country_df['predicted'].values
    if len(actual) == 0:
        continue  # skip countries with no data in the test window
    # Fix: derive RMSE from the UNROUNDED MSE. Previously MSE was rounded to
    # 6 decimals before sqrt, inconsistent with the other (unrounded) metrics.
    mse = mean_squared_error(actual, pred)
    results.append({
        'country': country,
        'MSE': round(mse, 6),
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(actual, pred),
        'MAPE': mean_absolute_percentage_error(actual, pred),
        'R2': r2_score(actual, pred),
    })
# Convert to DataFrame for easy export or display.
country_metrics = pd.DataFrame(results)
print(country_metrics)
country MSE RMSE MAE \
0 Austria 9.030588e+10 3.005094e+05 1.864868e+05
1 Belgium 2.222734e+08 1.490884e+04 9.771725e+03
2 Bosnia and Herzegovina 1.791651e+09 4.232790e+04 3.016650e+04
3 Canada 4.502302e+08 2.121863e+04 1.406842e+04
4 Czech Republic 9.575052e+10 3.094358e+05 1.572649e+05
5 Denmark 1.668924e+09 4.085246e+04 2.036179e+04
6 Finland 8.806166e+08 2.967519e+04 2.118418e+04
7 France 5.216737e+09 7.222698e+04 3.979571e+04
8 Germany 3.592724e+12 1.895448e+06 1.056744e+06
9 Hungary 1.180876e+10 1.086681e+05 5.517401e+04
10 Ireland 5.457368e+08 2.336101e+04 1.457357e+04
11 Italy 1.764677e+11 4.200806e+05 1.610925e+05
12 Netherlands 4.946827e+09 7.033368e+04 3.836726e+04
13 Norway 1.175707e+09 3.428858e+04 1.777885e+04
14 Poland 1.563322e+11 3.953887e+05 1.973553e+05
15 Romania 8.443926e+08 2.905843e+04 1.467024e+04
16 Slovakia 5.826269e+09 7.633000e+04 3.326159e+04
17 Slovenia 2.389104e+11 4.887847e+05 2.336568e+05
18 Spain 7.090350e+08 2.662771e+04 1.438300e+04
19 Sweden 1.700015e+09 4.123123e+04 2.305828e+04
20 Switzerland 2.423808e+09 4.923219e+04 2.420284e+04
21 USA 7.054939e+09 8.399368e+04 6.491786e+04
22 United Kingdom 5.882305e+10 2.425346e+05 1.640790e+05
MAPE R2
0 0.341512 0.793687
1 0.349829 0.982941
2 0.343102 0.951547
3 0.393229 0.633062
4 0.321398 0.778906
5 0.609540 0.932945
6 0.547492 0.515157
7 0.237299 0.906078
8 0.382247 0.148015
9 0.358141 0.928578
10 0.386605 0.666496
11 0.270358 0.680285
12 0.305097 0.965537
13 0.326562 0.937865
14 0.372516 0.721547
15 0.331823 0.700616
16 0.295391 0.967036
17 0.409152 0.673543
18 0.355399 0.829893
19 0.396644 0.938800
20 0.269205 0.814150
21 0.510810 0.336241
22 0.397067 0.557544
# Reload the raw panel for the arrivals model; '..' marks missing values in source.
cool = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/df_top_cool.csv")
cool.replace('..', np.nan, inplace=True)
cool['date'] = pd.to_datetime(cool[['year', 'month']].assign(day=1))
# Drop any pre-built lag columns; lags are rebuilt below per country.
cool = cool.drop(columns=[c for c in cool.columns if 'lag' in c], errors='ignore')
cool = cool[~cool['country'].isin(['Russian Federation'])]
# Coerce types: unparsable strings become NaN rather than raising.
cool['year'] = pd.to_numeric(cool['year'], downcast='integer', errors='coerce')
cool['month'] = pd.to_numeric(cool['month'], downcast='integer', errors='coerce')
for col in ('arrivals', 'overnights', 'unemployment_rate', 'exchange_rate', 'cpi'):
    cool[col] = pd.to_numeric(cool[col], errors='coerce')
for col in ('eu_member', 'euro_adopted'):
    cool[col] = cool[col].astype(int)
# Merge monthly Google Trends interest scores (keys: date, country).
trends = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/multiTimeline.csv")
# Snap both date columns to month start so the merge keys line up.
for frame in (cool, trends):
    frame['date'] = pd.to_datetime(frame['date']).dt.to_period('M').dt.to_timestamp()
trends_long = trends.melt(id_vars='date', var_name='country', value_name='google_trends')
cool = cool.merge(trends_long, on=['date', 'country'], how='left')
cool['google_trends'] = cool['google_trends'].fillna(-1)  # -1 = no trends coverage
# Order per country over time, dedupe (country, month), log1p-compress level
# variables, and build the target: next month's (log) arrivals per country.
cool = (cool.sort_values(['country', 'date'])
            .drop_duplicates(subset=['country', 'date'], keep='first')
            .reset_index(drop=True))
for col in ('arrivals', 'overnights', 'exchange_rate', 'cpi'):
    cool[col] = np.log1p(cool[col])
cool['arrivals_next_month'] = cool.groupby('country')['arrivals'].shift(-1)
# One-hot encode the calendar month for the rebuilt panel.
_month_labels = ['January', 'February', 'March', 'April', 'May', 'June',
                 'July', 'August', 'September', 'October', 'November', 'December']
month_name_by_number = dict(enumerate(_month_labels, start=1))
cool['month_name'] = cool['month'].map(month_name_by_number)
month_dummies = pd.get_dummies(cool['month_name'], prefix='month').astype(int)
cool = pd.concat([cool, month_dummies], axis=1).drop(columns=['month_name'])
# Per-country lagged predictors (same lag spec as the overnights pipeline).
lag_spec = {'arrivals': [1, 3, 6, 12], 'overnights': [1, 3, 6, 12],
            'cpi': [1], 'unemployment_rate': [3], 'google_trends': [1, 3]}
for var, steps in lag_spec.items():
    for step in steps:
        cool[f'{var}_lag_{step}'] = cool.groupby('country')[var].shift(step)
# Schengen entry years, redefined for this pipeline run (same table as above
# in the notebook; membership covers the full entry year).
schengen_entry_year = {
    'Austria': 1995, 'Belgium': 1995, 'Czech Republic': 2007,
    'Denmark': 2001, 'Finland': 1996, 'France': 1995,
    'Germany': 1995, 'Hungary': 2007, 'Italy': 1997,
    'Netherlands': 1995, 'Norway': 2001, 'Poland': 2007,
    'Portugal': 1995, 'Slovakia': 2007, 'Slovenia': 2007,
    'Spain': 1995, 'Sweden': 2001, 'Switzerland': 2008,
    'Romania': 2024,
}
def is_schengen_member(row):
    """Return 1 if row's country was a Schengen member in row's year, else 0.

    Unknown countries default to an entry year of +inf and therefore 0.
    """
    entry = schengen_entry_year.get(row['country'], np.inf)
    return int(row['year'] >= entry)
cool['schengen_member'] = cool.apply(is_schengen_member, axis=1).astype('int8')
# Inspect missingness (notebook display; no effect when run as a script).
cool.isna().sum().sort_values(ascending=False)
# Keep 2001 onward (no upper cutoff here, so later years stay in) and rows
# whose next-month arrivals target exists.
cool = cool[cool['date'] >= '2001-01-01']
cool = cool[cool['arrivals_next_month'].notna()].copy()
# Integer-encode country for the embedding input.
cool['country_encoded'] = LabelEncoder().fit_transform(cool['country'])
month_cols = [c for c in cool.columns if c.startswith('month_')]
feature_cols = ['unemployment_rate', 'exchange_rate', 'cpi_lag_1',
                'arrivals_lag_1', 'arrivals_lag_3', 'arrivals_lag_6',
                'arrivals_lag_12', 'google_trends_lag_1', 'google_trends_lag_3',
                'schengen_member', 'euro_adopted'] + month_cols
X_numeric = cool[feature_cols].values
X_country_array = cool['country_encoded'].astype('int32').values
y = cool['arrivals_next_month'].values
# Notebook-style display of the frame (no effect when run as a script).
cool
| country | arrivals | overnights | date | year | month | month_sin | month_cos | eu_member | euro_adopted | ... | overnights_lag_1 | overnights_lag_3 | overnights_lag_6 | overnights_lag_12 | cpi_lag_1 | unemployment_rate_lag_3 | google_trends_lag_1 | google_trends_lag_3 | schengen_member | country_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | Austria | 7.873217 | 9.102867 | 2001-01-01 | 2001 | 1 | 0.500000 | 8.660254e-01 | 1 | 1 | ... | 9.624303 | 11.286840 | 13.687496 | 9.077951 | 4.439116 | 3.6 | -1.0 | -1.0 | 1 | 0 |
| 13 | Austria | 8.664923 | 9.856815 | 2001-02-01 | 2001 | 2 | 0.866025 | 5.000000e-01 | 1 | 1 | ... | 9.102867 | 9.441055 | 13.771151 | 9.606159 | 4.440296 | 3.5 | -1.0 | -1.0 | 1 | 0 |
| 14 | Austria | 9.258368 | 10.391976 | 2001-03-01 | 2001 | 3 | 1.000000 | 6.120000e-17 | 1 | 1 | ... | 9.856815 | 9.624303 | 13.051318 | 10.189080 | 4.443827 | 3.8 | -1.0 | -1.0 | 1 | 0 |
| 15 | Austria | 10.645711 | 11.952940 | 2001-04-01 | 2001 | 4 | 0.866025 | -5.000000e-01 | 1 | 1 | ... | 10.391976 | 9.102867 | 11.286840 | 11.828816 | 4.445001 | 4.2 | -1.0 | -1.0 | 1 | 0 |
| 16 | Austria | 11.169632 | 12.518896 | 2001-05-01 | 2001 | 5 | 0.500000 | -8.660254e-01 | 1 | 1 | ... | 11.952940 | 9.856815 | 9.441055 | 12.243176 | 4.450853 | 4.5 | -1.0 | -1.0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6892 | United Kingdom | 11.936578 | 13.523750 | 2024-07-01 | 2024 | 7 | -0.500000 | -8.660254e-01 | 0 | 0 | ... | 13.349341 | 11.997115 | 8.512783 | 13.465556 | 5.016617 | 4.3 | 47.0 | 35.0 | 0 | 22 |
| 6893 | United Kingdom | 12.015293 | 13.678767 | 2024-08-01 | 2024 | 8 | -0.866025 | -5.000000e-01 | 0 | 0 | ... | 13.523750 | 12.908799 | 9.019059 | 13.632791 | 5.014627 | 4.1 | 47.0 | 32.0 | 0 | 22 |
| 6894 | United Kingdom | 11.742989 | 13.317415 | 2024-09-01 | 2024 | 9 | -1.000000 | -1.840000e-16 | 0 | 0 | ... | 13.678767 | 13.349341 | 9.886138 | 13.254668 | 5.018603 | 4.2 | 41.0 | 47.0 | 0 | 22 |
| 6895 | United Kingdom | 10.992100 | 12.530613 | 2024-10-01 | 2024 | 10 | -0.866025 | 5.000000e-01 | 0 | 0 | ... | 13.317415 | 13.523750 | 11.997115 | 12.325113 | 5.017942 | 4.2 | 30.0 | 47.0 | 0 | 22 |
| 6896 | United Kingdom | 8.841593 | 9.982068 | 2024-11-01 | 2024 | 11 | -0.500000 | 8.660254e-01 | 0 | 0 | ... | 12.530613 | 13.678767 | 12.908799 | 9.631482 | 5.023881 | 4.4 | 19.0 | 41.0 | 0 | 22 |
6599 rows × 42 columns
cool.isna().sum()
country 0 arrivals 0 overnights 0 date 0 year 0 month 0 month_sin 0 month_cos 0 eu_member 0 euro_adopted 0 unemployment_rate 0 exchange_rate 0 industry_production 363 cpi 0 google_trends 0 arrivals_next_month 0 month_April 0 month_August 0 month_December 0 month_February 0 month_January 0 month_July 0 month_June 0 month_March 0 month_May 0 month_November 0 month_October 0 month_September 0 arrivals_lag_1 0 arrivals_lag_3 0 arrivals_lag_6 0 arrivals_lag_12 0 overnights_lag_1 0 overnights_lag_3 0 overnights_lag_6 0 overnights_lag_12 0 cpi_lag_1 0 unemployment_rate_lag_3 0 google_trends_lag_1 0 google_trends_lag_3 0 schengen_member 0 country_encoded 0 dtype: int64
# Chronological split for the arrivals model: train <= 2016, val 2017-2020, test 2021+.
train_end = pd.Timestamp("2016-12-31")
val_end = pd.Timestamp("2020-12-31")
train_mask = cool['date'] <= train_end
val_mask = (cool['date'] > train_end) & (cool['date'] <= val_end)
test_mask = cool['date'] > val_end
# Scaler fitted on training rows only to avoid leaking future statistics.
scaler = StandardScaler().fit(X_numeric[train_mask])
X_num_train = scaler.transform(X_numeric[train_mask])
X_num_val = scaler.transform(X_numeric[val_mask])
X_num_test = scaler.transform(X_numeric[test_mask])
X_numeric_scaled_all = scaler.transform(X_numeric)
# Boolean masks as plain numpy arrays index the numpy feature/target arrays.
train_idx, val_idx, test_idx = (m.to_numpy() for m in (train_mask, val_mask, test_mask))
X_cat_train, X_cat_val, X_cat_test = (X_country_array[m] for m in (train_idx, val_idx, test_idx))
y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]
# model2: same architecture as the overnights model — numeric features plus a
# 10-dim country embedding feeding a small tanh MLP — trained on arrivals.
input_numeric = Input(shape=(X_num_train.shape[1],), name="numeric_input")
input_country = Input(shape=(1,), dtype='int32', name="country_input")
n_countries = cool['country_encoded'].nunique()
country_vec = Flatten()(Embedding(input_dim=n_countries, output_dim=10)(input_country))
hidden = Concatenate()([input_numeric, country_vec])
for units in (64, 32):
    hidden = Dense(units, activation='tanh')(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.2)(hidden)
output = Dense(1)(hidden)
model2 = Model(inputs=[input_numeric, input_country], outputs=output)
model2.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
model2.summary()
# Train model2; early stopping restores the best-epoch weights.
stopper = EarlyStopping(monitor='val_loss', patience=40, restore_best_weights=True)
history = model2.fit(
    x=[X_num_train, X_cat_train],
    y=y_train,
    validation_data=([X_num_val, X_cat_val], y_val),
    epochs=100,
    batch_size=16,
    callbacks=[stopper],
)
Model: "functional_37"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │ country_input │ (None, 1) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ embedding_37 │ (None, 1, 10) │ 230 │ country_input[0]… │ │ (Embedding) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ numeric_input │ (None, 25) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ flatten_37 │ (None, 10) │ 0 │ embedding_37[0][… │ │ (Flatten) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ concatenate_37 │ (None, 35) │ 0 │ numeric_input[0]… │ │ (Concatenate) │ │ │ flatten_37[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_111 (Dense) │ (None, 64) │ 2,304 │ concatenate_37[0… │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 64) │ 256 │ dense_111[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_74 │ (None, 64) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_112 (Dense) │ (None, 32) │ 2,080 │ dropout_74[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 32) │ 128 │ dense_112[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_75 │ (None, 32) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_113 (Dense) │ (None, 1) │ 33 │ dropout_75[0][0] │ 
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
Total params: 5,031 (19.65 KB)
Trainable params: 4,839 (18.90 KB)
Non-trainable params: 192 (768.00 B)
Epoch 1/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 14s 14ms/step - loss: 67.6397 - val_loss: 24.2574 Epoch 2/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 16.7866 - val_loss: 2.7601 Epoch 3/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 3.6225 - val_loss: 2.3808 Epoch 4/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 4s 13ms/step - loss: 2.9429 - val_loss: 2.4379 Epoch 5/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 5s 10ms/step - loss: 2.4179 - val_loss: 2.2861 Epoch 6/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 12ms/step - loss: 2.2089 - val_loss: 2.4861 Epoch 7/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.8824 - val_loss: 2.4279 Epoch 8/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.7382 - val_loss: 2.4234 Epoch 9/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 7s 24ms/step - loss: 1.6110 - val_loss: 2.4041 Epoch 10/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 6s 7ms/step - loss: 1.3417 - val_loss: 2.2658 Epoch 11/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 1.2643 - val_loss: 2.2970 Epoch 12/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 11ms/step - loss: 1.2401 - val_loss: 2.3647 Epoch 13/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 8s 18ms/step - loss: 1.1678 - val_loss: 2.3226 Epoch 14/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 1.1266 - val_loss: 2.3435 Epoch 15/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.1508 - val_loss: 2.2937 Epoch 16/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.1442 - val_loss: 2.3006 Epoch 17/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.0613 - val_loss: 2.3334 Epoch 18/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.0120 - val_loss: 2.3491 Epoch 19/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.0003 - val_loss: 2.3213 Epoch 20/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9837 - val_loss: 2.3538 Epoch 21/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.9836 - val_loss: 2.3499 Epoch 22/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9727 - val_loss: 2.3663 Epoch 23/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 
0.9948 - val_loss: 2.3869 Epoch 24/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.9877 - val_loss: 2.3571 Epoch 25/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9401 - val_loss: 2.3121 Epoch 26/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 0.9512 - val_loss: 2.4112 Epoch 27/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9802 - val_loss: 2.3515 Epoch 28/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9385 - val_loss: 2.4328 Epoch 29/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9237 - val_loss: 2.3702 Epoch 30/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9208 - val_loss: 2.4257 Epoch 31/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9504 - val_loss: 2.4614 Epoch 32/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 0.9075 - val_loss: 2.3074 Epoch 33/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.8661 - val_loss: 2.2357 Epoch 34/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.8901 - val_loss: 2.3487 Epoch 35/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 5s 17ms/step - loss: 0.8557 - val_loss: 2.4129 Epoch 36/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 4s 14ms/step - loss: 0.8772 - val_loss: 2.3710 Epoch 37/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.8398 - val_loss: 2.4213 Epoch 38/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.8535 - val_loss: 2.3572 Epoch 39/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8696 - val_loss: 2.3496 Epoch 40/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.8352 - val_loss: 2.2743 Epoch 41/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.8381 - val_loss: 2.4121 Epoch 42/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.8806 - val_loss: 2.2995 Epoch 43/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8205 - val_loss: 2.3153 Epoch 44/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.8615 - val_loss: 2.3236 Epoch 45/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8250 - val_loss: 2.2614 Epoch 46/100 276/276 
━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8223 - val_loss: 2.3344 Epoch 47/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8433 - val_loss: 2.3264 Epoch 48/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8229 - val_loss: 2.2402 Epoch 49/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.7742 - val_loss: 2.3089 Epoch 50/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.7955 - val_loss: 2.2734 Epoch 51/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7981 - val_loss: 2.2787 Epoch 52/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.7627 - val_loss: 2.2760 Epoch 53/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 9ms/step - loss: 0.8173 - val_loss: 2.3125 Epoch 54/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.8277 - val_loss: 2.2606 Epoch 55/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.7665 - val_loss: 2.3070 Epoch 56/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7709 - val_loss: 2.2386 Epoch 57/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - loss: 0.7876 - val_loss: 2.3755 Epoch 58/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7985 - val_loss: 2.2257 Epoch 59/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7697 - val_loss: 2.3401 Epoch 60/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7739 - val_loss: 2.2278 Epoch 61/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7652 - val_loss: 2.3139 Epoch 62/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7971 - val_loss: 2.2271 Epoch 63/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.7472 - val_loss: 2.2477 Epoch 64/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.7693 - val_loss: 2.2629 Epoch 65/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 10ms/step - loss: 0.7221 - val_loss: 2.2515 Epoch 66/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - loss: 0.7554 - val_loss: 2.2968 Epoch 67/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.7474 - val_loss: 2.2501 Epoch 68/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7336 - val_loss: 
2.1876 Epoch 69/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.6973 - val_loss: 2.2026 Epoch 70/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.6985 - val_loss: 2.2146 Epoch 71/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - loss: 0.7471 - val_loss: 2.2008 Epoch 72/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7348 - val_loss: 2.2450 Epoch 73/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.7008 - val_loss: 2.1541 Epoch 74/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.7303 - val_loss: 2.1802 Epoch 75/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 0.7044 - val_loss: 2.2246 Epoch 76/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 0.7006 - val_loss: 2.2524 Epoch 77/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7326 - val_loss: 2.2514 Epoch 78/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.6666 - val_loss: 2.2973 Epoch 79/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.6815 - val_loss: 2.1639 Epoch 80/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.6999 - val_loss: 2.2235 Epoch 81/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 0.6720 - val_loss: 2.2327 Epoch 82/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.7127 - val_loss: 2.2349 Epoch 83/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.6997 - val_loss: 2.2009 Epoch 84/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.6761 - val_loss: 2.1925 Epoch 85/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.6760 - val_loss: 2.2075 Epoch 86/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.6772 - val_loss: 2.2640 Epoch 87/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.6492 - val_loss: 2.2148 Epoch 88/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.6594 - val_loss: 2.2313 Epoch 89/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.6871 - val_loss: 2.1818 Epoch 90/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.6710 - val_loss: 2.1486 Epoch 91/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - 
loss: 0.6750 - val_loss: 2.2075 Epoch 92/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 0.6639 - val_loss: 2.1997 Epoch 93/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.6333 - val_loss: 2.1981 Epoch 94/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - loss: 0.6760 - val_loss: 2.1814 Epoch 95/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.6528 - val_loss: 2.2041 Epoch 96/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - loss: 0.6243 - val_loss: 2.2046 Epoch 97/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.6439 - val_loss: 2.2165 Epoch 98/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.6469 - val_loss: 2.1735 Epoch 99/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.6199 - val_loss: 2.1718 Epoch 100/100 276/276 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.6182 - val_loss: 2.2016
# Baseline predictions
# Permutation importance: the increase in test MSE when one feature column
# (or the whole month-dummy group) is shuffled, relative to the baseline MSE.
baseline_preds = model2.predict([X_num_test, X_cat_test]).flatten()
baseline_mse = mean_squared_error(y_test, baseline_preds)
# Column order here must match the column order used to build X_numeric.
feature_names = [
    'unemployment_rate', 'exchange_rate', 'cpi_lag_1',
    'arrivals_lag_1', 'arrivals_lag_3', 'arrivals_lag_6', 'arrivals_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted'] + month_cols
# Handle grouped month dummies so it's not one bar for each month.
month_indices = [i for i, f in enumerate(feature_names) if f.startswith('month_')]
month_index_set = set(month_indices)
# Individual month-dummy importances were never reported (they were computed
# and then filtered out), so skip those columns here instead of spending one
# model.predict call per dummy.
feature_names_final = []
importances_final = []
for i in range(X_num_test.shape[1]):
    if i in month_index_set:
        continue
    X_permuted = X_num_test.copy()
    X_permuted[:, i] = np.random.permutation(X_permuted[:, i])
    permuted_preds = model2.predict([X_permuted, X_cat_test]).flatten()
    feature_names_final.append(feature_names[i])
    importances_final.append(mean_squared_error(y_test, permuted_preds) - baseline_mse)
if month_indices:
    # Safe grouped permutation: permute whole rows of the month block jointly
    # so every row keeps a valid one-hot month vector.
    X_month_permuted = X_num_test.copy()
    row_perm = np.random.permutation(len(X_month_permuted))
    X_month_permuted[:, month_indices] = X_month_permuted[row_perm][:, month_indices]
    month_preds = model2.predict([X_month_permuted, X_cat_test]).flatten()
    month_importance = mean_squared_error(y_test, month_preds) - baseline_mse
else:
    month_importance = 0  # fallback if no month cols present
feature_names_final = feature_names_final + ['month_group']
importances_final = importances_final + [month_importance]
# Sort bars by importance, largest first.
sorted_pairs = sorted(zip(importances_final, feature_names_final), reverse=True)
importances_sorted, feature_names_sorted = zip(*sorted_pairs)
plt.figure(figsize=(12, 6))
bars = plt.barh(feature_names_sorted, importances_sorted)
plt.xlabel("Increase in MSE when shuffled")
plt.title("Model 2: Arrivals (2024)")
plt.gca().invert_yaxis()
# Add a numeric label at the end of each bar.
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height() / 2,
             f"{width:.4f}", va='center')
plt.tight_layout()
plt.show()
34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
## Model evalauation
# Predict on the train and test splits, then report the full metric suite
# (MSE, RMSE, MAE, MAPE, R^2) for each.
train_preds = model2.predict([X_num_train, X_cat_train]).flatten()
test_preds = model2.predict([X_num_test, X_cat_test]).flatten()
def _metrics(y_true, y_hat):
    # Helper: return (MSE, RMSE, MAE, MAPE, R^2) for one split.
    mse = mean_squared_error(y_true, y_hat)
    return (mse,
            np.sqrt(mse),
            mean_absolute_error(y_true, y_hat),
            mean_absolute_percentage_error(y_true, y_hat),
            r2_score(y_true, y_hat))
train_mse, train_rmse2, train_mae2, train_mape2, train_r2_2 = _metrics(y_train, train_preds)
test_mse, test_rmse2, test_mae2, test_mape2, test_r2_2 = _metrics(y_test, test_preds)
# Print all metrics
print(f"Train MSE: {train_mse:.4f}, RMSE: {train_rmse2:.4f}, MAE: {train_mae2:.4f}, MAPE: {train_mape2:.4f}, R²: {train_r2_2:.4f}")
print(f"Test MSE: {test_mse:.4f}, RMSE: {test_rmse2:.4f}, MAE: {test_mae2:.4f}, MAPE: {test_mape2:.4f}, R²: {test_r2_2:.4f}")
# Plot training history
plt.figure(figsize=(10, 4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training History')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()
138/138 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step 34/34 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step Train MSE: 0.0749, RMSE: 0.2738, MAE: 0.2059, MAPE: 0.0257, R²: 0.9807 Test MSE: 0.4215, RMSE: 0.6492, MAE: 0.4911, MAPE: 0.0579, R²: 0.8919
# Global trend plot (total actual vs predicted per month)
# Back-transform the log1p-scaled targets/predictions to arrival counts,
# then sum per month across all countries for one global comparison curve.
y_pred_all = model2.predict([X_numeric_scaled_all, X_country_array]).flatten()
y_actual_all = np.expm1(y)
y_pred_all = np.expm1(y_pred_all)
df_plot = cool.copy()
df_plot['actual'] = y_actual_all
df_plot['predicted'] = y_pred_all
monthly_totals_all = df_plot.groupby('date')[['actual', 'predicted']].sum()
plt.figure(figsize=(16, 6))
plt.plot(monthly_totals_all.index, monthly_totals_all['actual'], label='Total Actual', linewidth=2)
plt.plot(monthly_totals_all.index, monthly_totals_all['predicted'], label='Total Predicted', linestyle='--', alpha=0.8)
plt.title("Total Monthly Arrivals (2024)")
plt.xlabel("Date")
plt.ylabel("Total Arrivals")
plt.ticklabel_format(style='plain', axis='y')
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# Single rotation setting; the earlier 90-degree call was immediately
# overridden by this one, so it is dropped.
plt.xticks(rotation=45)
plt.xlim(monthly_totals_all.index.min(), monthly_totals_all.index.max())
# Mark the chronological split boundaries.
plt.axvline(x=train_end, color='gray', linestyle='--', linewidth=1.5, label='Train/Val Split')
plt.axvline(x=val_end, color='black', linestyle='--', linewidth=1.5, label='Val/Test Split')
plt.text(train_end, plt.ylim()[1]*0.95, 'Training set', rotation=90, ha='right', va='top', color='gray')
plt.text(val_end, plt.ylim()[1]*0.95, 'Testing set', rotation=90, ha='right', va='top', color='black')
# Build the legend AFTER the labeled axvlines so the split lines actually
# appear in it (previously legend() ran before they were added).
plt.legend()
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
plt.tight_layout()  # after all artists are added, so the layout accounts for them
plt.show()
207/207 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step
# Country-level plots
# One subplot per country comparing actual vs predicted monthly totals,
# laid out on a 3-column grid; unused panels are removed at the end.
countries = df_plot['country'].unique()
n_rows = len(countries) // 3 + 1
fig, axes = plt.subplots(n_rows, 3, figsize=(18, 3 * n_rows), sharex=False, sharey=False)
axes = axes.flatten()
for idx, name in enumerate(countries):
    series = (df_plot[df_plot['country'] == name]
              .groupby('date')[['actual', 'predicted']].sum())
    panel = axes[idx]
    panel.plot(series.index, series['actual'], label='Actual')
    panel.plot(series.index, series['predicted'], label='Predicted', linestyle='--')
    panel.set_title(name)
    # Show x-axis ticks every 3 years
    panel.xaxis.set_major_locator(mdates.YearLocator(base=3))
    panel.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    panel.tick_params(axis='x', rotation=45)
    panel.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    # Add legend only once
    if idx == 0:
        panel.legend()
# Remove unused subplots
for spare in axes[len(countries):]:
    fig.delaxes(spare)
fig.tight_layout()
fig.suptitle('Actual vs Predicted Monthly Arrivals per Country (2024)', fontsize=16, y=1.02)
plt.show()
# df_plot: DataFrame with columns ['country', 'date', 'actual', 'predicted']
# Per-country error metrics computed on the test period only (dates after
# val_end), on the original (back-transformed) arrival scale.
countries = df_plot['country'].unique()
results = []
for country in countries:
    # Subset data for this country (test set only, if you want)
    country_df = df_plot[(df_plot['country'] == country) & (df_plot['date'] > val_end)]
    actual = country_df['actual'].values
    pred = country_df['predicted'].values
    if len(actual) == 0:
        continue  # Skip countries with no data in test set
    mse = mean_squared_error(actual, pred)
    # Take the square root of the UNROUNDED MSE; previously MSE was rounded
    # to 6 decimals first, which biased the derived RMSE. Rounding is now
    # applied only to the stored/displayed MSE value.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, pred)
    mape = mean_absolute_percentage_error(actual, pred)
    r2 = r2_score(actual, pred)
    results.append({
        'country': country,
        'MSE': round(mse, 6),
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'R2': r2
    })
# Convert to DataFrame for easy export or display
country_metrics = pd.DataFrame(results)
print(country_metrics)
country MSE RMSE MAE \
0 Austria 5.081636e+09 71285.595256 49918.328535
1 Belgium 6.537096e+07 8085.231123 4196.487165
2 Bosnia and Herzegovina 1.642307e+08 12815.252438 8874.936591
3 Canada 1.947356e+07 4412.885448 2575.362746
4 Czech Republic 1.393131e+09 37324.668759 19292.966149
5 Denmark 1.315828e+08 11470.952964 4718.637211
6 Finland 1.638380e+07 4047.691017 2332.584180
7 France 4.337233e+08 20826.025097 12390.277720
8 Germany 7.127464e+10 266973.101306 161100.764622
9 Hungary 1.225432e+09 35006.166906 18925.231006
10 Ireland 3.426333e+07 5853.489093 3640.333267
11 Italy 1.647708e+09 40591.974631 20378.707260
12 Netherlands 6.173665e+08 24846.861121 12934.523989
13 Norway 6.668156e+07 8165.878025 3941.330346
14 Poland 3.940971e+09 62777.156710 35559.869517
15 Romania 3.460065e+07 5882.232142 3181.175834
16 Slovakia 1.441173e+08 12004.884555 6607.882405
17 Slovenia 2.817913e+09 53084.015108 37657.455940
18 Spain 5.689977e+07 7543.193539 4248.567497
19 Sweden 5.434318e+07 7371.782476 3788.066846
20 Switzerland 2.199884e+08 14832.006118 8522.123304
21 USA 3.815869e+08 19534.250033 13980.165553
22 United Kingdom 8.179276e+08 28599.433484 17370.251709
MAPE R2
0 0.574757 0.617748
1 0.581596 0.851310
2 0.408353 0.854232
3 0.614233 0.829186
4 0.311453 0.848878
5 0.522899 0.700217
6 0.762202 0.516292
7 0.414942 0.834006
8 0.527543 0.221635
9 0.459027 0.787361
10 0.653992 0.523599
11 0.382417 0.811186
12 0.397557 0.803672
13 0.792227 0.571320
14 0.477038 0.710003
15 0.390700 0.815610
16 0.313489 0.961850
17 0.549796 0.772319
18 0.401895 0.862284
19 0.472303 0.882823
20 0.345093 0.620763
21 0.455143 0.801948
22 0.743835 0.738716
# Load and clean data
# Second pipeline run: reload the raw panel and rebuild all engineered
# features from scratch for the next model.
cool = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/df_top_cool.csv")
cool.replace('..', np.nan, inplace=True)  # '..' is the source's missing-value marker
cool['date'] = pd.to_datetime(cool[['year', 'month']].assign(day=1))
# Drop any lag columns shipped in the CSV; lags are recomputed below.
lag_cols = [col for col in cool.columns if 'lag' in col]
cool = cool.drop(columns=lag_cols, errors='ignore')
cool = cool[~cool['country'].isin(['Russian Federation'])]
# Convert types
cool['year'] = pd.to_numeric(cool['year'], downcast='integer', errors='coerce')
cool['month'] = pd.to_numeric(cool['month'], downcast='integer', errors='coerce')
cool['arrivals'] = pd.to_numeric(cool['arrivals'], errors='coerce')
cool['overnights'] = pd.to_numeric(cool['overnights'], errors='coerce')
cool['unemployment_rate'] = pd.to_numeric(cool['unemployment_rate'], errors='coerce')
cool['exchange_rate'] = pd.to_numeric(cool['exchange_rate'], errors='coerce')
cool['cpi'] = pd.to_numeric(cool['cpi'], errors='coerce')
cool['eu_member'] = cool['eu_member'].astype(int)
cool['euro_adopted'] = cool['euro_adopted'].astype(int)
# Merge Google Trends data
td = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/multiTimeline.csv")
# Normalize both date columns to the first day of the month before merging.
cool['date'] = pd.to_datetime(cool['date']).dt.to_period('M').dt.to_timestamp()
td['date'] = pd.to_datetime(td['date']).dt.to_period('M').dt.to_timestamp()
# Wide -> long: one (date, country, google_trends) row per observation.
td_long = td.melt(id_vars='date', var_name='country', value_name='google_trends')
cool = cool.merge(td_long, on=['date', 'country'], how='left')
cool['google_trends'] = cool['google_trends'].fillna(-1)  # -1 flags "no trends data"
# Sort and log-transform
cool = cool.sort_values(['country', 'date']).reset_index(drop=True)
cool = cool.drop_duplicates(subset=['country', 'date'], keep='first').reset_index(drop=True)
# log1p compresses the heavy-tailed count/price variables.
cool['arrivals'] = np.log1p(cool['arrivals'])
cool['overnights'] = np.log1p(cool['overnights'])
cool['exchange_rate'] = np.log1p(cool['exchange_rate'])
cool['cpi'] = np.log1p(cool['cpi'])
# Target: next month's (log) arrivals, shifted within each country.
cool['arrivals_next_month'] = cool.groupby('country')['arrivals'].shift(-1)
# One-hot encode month
month_names = {i: month for i, month in enumerate(['January','February','March','April','May','June','July','August','September','October','November','December'], 1)}
cool['month_name'] = cool['month'].map(month_names)
ohe_month = pd.get_dummies(cool['month_name'], prefix='month').astype(int)
cool = pd.concat([cool, ohe_month], axis=1).drop(columns=['month_name'])
# Create lags
# Per-country shifted copies of selected variables at the listed horizons.
lags = {'arrivals': [1, 3, 6, 12], 'overnights': [1, 3, 6, 12], 'cpi': [1], 'unemployment_rate': [3], 'google_trends': [1, 3]}
for var, steps in lags.items():
    for lag in steps:
        cool[f'{var}_lag_{lag}'] = cool.groupby('country')[var].shift(lag)
# Step 1: Define Schengen entry years
# Year each sending country joined the Schengen area; countries absent from
# this mapping are treated as never having joined.
schengen_entry_year = {
    'Austria': 1995, 'Belgium': 1995, 'Czech Republic': 2007,
    'Denmark': 2001, 'Finland': 1996, 'France': 1995,
    'Germany': 1995, 'Hungary': 2007, 'Italy': 1997,
    'Netherlands': 1995, 'Norway': 2001, 'Poland': 2007,
    'Portugal': 1995, 'Slovakia': 2007, 'Slovenia': 2007,
    'Spain': 1995, 'Sweden': 2001, 'Switzerland': 2008,
    'Romania': 2024,
}
def is_schengen_member(row):
    """Return 1 if the row's country was a Schengen member in that year, else 0.

    Unknown countries get an entry year of +inf, so they are never members.
    """
    joined = schengen_entry_year.get(row['country'], np.inf)
    return int(row['year'] >= joined)
# Row-wise membership flag stored as a compact int8 dummy.
cool['schengen_member'] = cool.apply(is_schengen_member, axis=1).astype('int8')
# Filter and drop missing
# NOTE(review): the expression below computes NaN counts but its result is
# not stored or printed here — it only displayed output in the notebook.
cool.isna().sum().sort_values(ascending=False)
# Restrict to 2001-2019 and keep only rows with a valid next-month target.
cool = cool[cool['date'] >= '2001-01-01']
cool = cool[cool['date'] <= '2019-12-01']
cool = cool[cool['arrivals_next_month'].notna()].copy()
# Label encode country and scale numeric features
cool['country_encoded'] = LabelEncoder().fit_transform(cool['country'])
month_cols = [col for col in cool.columns if col.startswith('month_')]
# Numeric design matrix: macro indicators, lags, policy dummies, month dummies.
X_numeric = cool[[
    'unemployment_rate', 'exchange_rate', 'cpi_lag_1',
    'arrivals_lag_1', 'arrivals_lag_3', 'arrivals_lag_6', 'arrivals_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member','euro_adopted'] + month_cols].values
X_country_array = cool['country_encoded'].astype('int32').values
y = cool['arrivals_next_month'].values
# Time-based split
# Second run uses earlier boundaries than the first:
# train <= 2014-12, validation 2015-01..2017-12, test > 2017-12.
train_end = pd.Timestamp("2014-12-31")
val_end = pd.Timestamp("2017-12-31")
train_mask = cool['date'] <= train_end
val_mask = (cool['date'] > train_end) & (cool['date'] <= val_end)
test_mask = cool['date'] > val_end
# Scaler is fit on training rows only to avoid leakage from the future.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_numeric[train_mask])
X_num_val = scaler.transform(X_numeric[val_mask])
X_num_test = scaler.transform(X_numeric[test_mask])
X_numeric_scaled_all = scaler.transform(X_numeric)  # full-period features for plotting
# Boolean masks -> NumPy for positional indexing of plain arrays.
X_cat_train = X_country_array[train_mask.to_numpy()]
X_cat_val = X_country_array[val_mask.to_numpy()]
X_cat_test = X_country_array[test_mask.to_numpy()]
y_train = y[train_mask.to_numpy()]
y_val = y[val_mask.to_numpy()]
y_test = y[test_mask.to_numpy()]
# Define the next model — NOTE(review): the original comment said "model4"
# but the variable is named model1; architecture is identical to model2.
input_numeric = Input(shape=(X_num_train.shape[1],), name="numeric_input")
input_country = Input(shape=(1,), dtype='int32', name="country_input")
n_countries = cool['country_encoded'].nunique()
# 10-dimensional learned embedding per country, flattened and concatenated
# with the scaled numeric features.
embedding = Embedding(input_dim=n_countries, output_dim=10)(input_country)
embedding_flat = Flatten()(embedding)
x = Concatenate()([input_numeric, embedding_flat])
x = Dense(64, activation='tanh')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='tanh')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
output = Dense(1)(x)  # single linear unit for the regression target
model1 = Model(inputs=[input_numeric, input_country], outputs=output)
model1.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
model1.summary()
# Train the model (original comment: "Train mode4")
# Early stopping on validation loss with best-weight restoration.
early_stop = EarlyStopping(monitor='val_loss', patience=40, restore_best_weights=True)
history = model1.fit(
    [X_num_train, X_cat_train], y_train,
    validation_data=([X_num_val, X_cat_val], y_val),
    epochs=100, batch_size=16, callbacks=[early_stop]
)
Model: "functional_30"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │ country_input │ (None, 1) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ embedding_30 │ (None, 1, 10) │ 230 │ country_input[0]… │ │ (Embedding) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ numeric_input │ (None, 25) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ flatten_30 │ (None, 10) │ 0 │ embedding_30[0][… │ │ (Flatten) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ concatenate_30 │ (None, 35) │ 0 │ numeric_input[0]… │ │ (Concatenate) │ │ │ flatten_30[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_90 (Dense) │ (None, 64) │ 2,304 │ concatenate_30[0… │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 64) │ 256 │ dense_90[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_60 │ (None, 64) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_91 (Dense) │ (None, 32) │ 2,080 │ dropout_60[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 32) │ 128 │ dense_91[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_61 │ (None, 32) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_92 (Dense) │ (None, 1) │ 33 │ dropout_61[0][0] │ 
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
Total params: 5,031 (19.65 KB)
Trainable params: 4,839 (18.90 KB)
Non-trainable params: 192 (768.00 B)
Epoch 1/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 9s 8ms/step - loss: 67.7775 - val_loss: 31.6445 Epoch 2/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 22.3988 - val_loss: 1.5322 Epoch 3/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 3.8682 - val_loss: 0.7337 Epoch 4/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 3.3002 - val_loss: 0.5032 Epoch 5/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 2.6980 - val_loss: 0.4413 Epoch 6/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 2.3971 - val_loss: 0.3308 Epoch 7/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 2.0858 - val_loss: 0.3201 Epoch 8/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.9076 - val_loss: 0.3460 Epoch 9/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 9ms/step - loss: 1.8606 - val_loss: 0.3169 Epoch 10/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.6263 - val_loss: 0.3324 Epoch 11/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.5082 - val_loss: 0.2692 Epoch 12/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.3668 - val_loss: 0.2613 Epoch 13/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.4066 - val_loss: 0.3108 Epoch 14/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.2721 - val_loss: 0.2947 Epoch 15/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.3073 - val_loss: 0.2788 Epoch 16/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 1.1615 - val_loss: 0.2917 Epoch 17/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.2453 - val_loss: 0.2769 Epoch 18/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.1692 - val_loss: 0.2724 Epoch 19/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 11ms/step - loss: 1.1334 - val_loss: 0.3020 Epoch 20/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.1748 - val_loss: 0.2422 Epoch 21/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 1.1109 - val_loss: 0.2975 Epoch 22/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 1.0664 - val_loss: 0.3064 Epoch 23/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.1109 
- val_loss: 0.2958 Epoch 24/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0124 - val_loss: 0.2453 Epoch 25/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0531 - val_loss: 0.2697 Epoch 26/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0846 - val_loss: 0.2782 Epoch 27/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0858 - val_loss: 0.2751 Epoch 28/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.0140 - val_loss: 0.2530 Epoch 29/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0381 - val_loss: 0.2716 Epoch 30/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9978 - val_loss: 0.3025 Epoch 31/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 1.0015 - val_loss: 0.2562 Epoch 32/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9500 - val_loss: 0.2932 Epoch 33/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.9884 - val_loss: 0.2509 Epoch 34/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9633 - val_loss: 0.2902 Epoch 35/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9817 - val_loss: 0.3144 Epoch 36/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9323 - val_loss: 0.2615 Epoch 37/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9170 - val_loss: 0.2546 Epoch 38/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9498 - val_loss: 0.2824 Epoch 39/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9296 - val_loss: 0.2132 Epoch 40/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9895 - val_loss: 0.2419 Epoch 41/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 10ms/step - loss: 0.9238 - val_loss: 0.3262 Epoch 42/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9853 - val_loss: 0.2984 Epoch 43/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.9708 - val_loss: 0.2598 Epoch 44/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 11ms/step - loss: 0.9413 - val_loss: 0.2332 Epoch 45/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9508 - val_loss: 0.2466 Epoch 46/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 
1s 5ms/step - loss: 0.8969 - val_loss: 0.2490 Epoch 47/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9293 - val_loss: 0.2760 Epoch 48/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 7ms/step - loss: 0.9200 - val_loss: 0.2497 Epoch 49/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9380 - val_loss: 0.2382 Epoch 50/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9109 - val_loss: 0.2762 Epoch 51/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.9188 - val_loss: 0.2745 Epoch 52/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.9029 - val_loss: 0.2906 Epoch 53/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 0.9281 - val_loss: 0.2222 Epoch 54/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 9ms/step - loss: 0.8881 - val_loss: 0.2893 Epoch 55/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8973 - val_loss: 0.2331 Epoch 56/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.8678 - val_loss: 0.2558 Epoch 57/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 12ms/step - loss: 0.8569 - val_loss: 0.2376 Epoch 58/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8497 - val_loss: 0.2230 Epoch 59/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8584 - val_loss: 0.2637 Epoch 60/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8452 - val_loss: 0.2468 Epoch 61/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8408 - val_loss: 0.2767 Epoch 62/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8325 - val_loss: 0.2766 Epoch 63/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.8907 - val_loss: 0.3192 Epoch 64/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 10ms/step - loss: 0.8712 - val_loss: 0.1979 Epoch 65/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.8256 - val_loss: 0.2884 Epoch 66/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.8836 - val_loss: 0.2676 Epoch 67/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8484 - val_loss: 0.2426 Epoch 68/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 9ms/step - loss: 0.8475 - val_loss: 0.2432 Epoch 69/100 
242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8207 - val_loss: 0.2640 Epoch 70/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8451 - val_loss: 0.2363 Epoch 71/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.7694 - val_loss: 0.2579 Epoch 72/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8216 - val_loss: 0.3550 Epoch 73/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8074 - val_loss: 0.2073 Epoch 74/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8457 - val_loss: 0.2140 Epoch 75/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7992 - val_loss: 0.2816 Epoch 76/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.8139 - val_loss: 0.2271 Epoch 77/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.7944 - val_loss: 0.2203 Epoch 78/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8483 - val_loss: 0.2180 Epoch 79/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 9ms/step - loss: 0.8018 - val_loss: 0.2443 Epoch 80/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8288 - val_loss: 0.2461 Epoch 81/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8137 - val_loss: 0.2634 Epoch 82/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7978 - val_loss: 0.2523 Epoch 83/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 12ms/step - loss: 0.7900 - val_loss: 0.2177 Epoch 84/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 4s 5ms/step - loss: 0.7369 - val_loss: 0.2790 Epoch 85/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.8106 - val_loss: 0.2041 Epoch 86/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7447 - val_loss: 0.2484 Epoch 87/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 3s 8ms/step - loss: 0.7948 - val_loss: 0.2330 Epoch 88/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.7684 - val_loss: 0.2200 Epoch 89/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.7770 - val_loss: 0.2393 Epoch 90/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.8086 - val_loss: 0.1829 Epoch 91/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7575 - 
val_loss: 0.1823 Epoch 92/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7621 - val_loss: 0.1903 Epoch 93/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - loss: 0.7652 - val_loss: 0.2086 Epoch 94/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7786 - val_loss: 0.2223 Epoch 95/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7741 - val_loss: 0.2094 Epoch 96/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - loss: 0.7545 - val_loss: 0.1820 Epoch 97/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.7618 - val_loss: 0.2086 Epoch 98/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7097 - val_loss: 0.1998 Epoch 99/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.7333 - val_loss: 0.2348 Epoch 100/100 242/242 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.7541 - val_loss: 0.2034
# Permutation feature importance for Model 1: shuffle one feature at a time
# and measure how much the test-set MSE degrades relative to the baseline.
baseline_preds = model1.predict([X_num_test, X_cat_test]).flatten()
baseline_mse = mean_squared_error(y_test, baseline_preds)
# NOTE(review): first label is 'unemployment_rate' but the analogous feature
# list later in this file uses 'unemp_rate_lag_3' — confirm against the
# features model1 was actually trained on.
feature_names = [
    'unemployment_rate', 'exchange_rate', 'cpi_lag_1',
    'arrivals_lag_1', 'arrivals_lag_3', 'arrivals_lag_6', 'arrivals_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted'] + month_cols
# Guard against silent label/column misalignment: the hard-coded name list
# must match the numeric matrix column-for-column or every bar is mislabeled.
assert len(feature_names) == X_num_test.shape[1], (
    f"feature_names has {len(feature_names)} entries but X_num_test has "
    f"{X_num_test.shape[1]} columns")
# Handle grouped month dummies so it's not one bar for each month.
month_indices = [i for i, f in enumerate(feature_names) if f.startswith('month_')]
month_index_set = set(month_indices)  # O(1) membership tests
importances = []
for i in range(X_num_test.shape[1]):
    # Month dummies are scored jointly below; permuting them one-by-one here
    # would only produce results that get discarded (and waste a predict call).
    if i in month_index_set:
        continue
    X_permuted = X_num_test.copy()
    X_permuted[:, i] = np.random.permutation(X_permuted[:, i])
    permuted_preds = model1.predict([X_permuted, X_cat_test]).flatten()
    permuted_mse = mean_squared_error(y_test, permuted_preds)
    # Clamp at 0 so sampling noise cannot produce a negative importance.
    importances.append(max(0, permuted_mse - baseline_mse))
if month_indices:
    # Safe grouped permutation: shuffle the same row order across all month
    # columns jointly, keeping each row's one-hot pattern intact.
    X_month_permuted = X_num_test.copy()
    row_perm = np.random.permutation(len(X_month_permuted))
    X_month_permuted[:, month_indices] = X_month_permuted[row_perm][:, month_indices]
    month_preds = model1.predict([X_month_permuted, X_cat_test]).flatten()
    month_importance = max(0, mean_squared_error(y_test, month_preds) - baseline_mse)
else:
    month_importance = 0  # fallback if no month cols present
filtered_names = [f for i, f in enumerate(feature_names) if i not in month_index_set]
feature_names_final = filtered_names + ['month_group']
importances_final = importances + [month_importance]
# Largest importance first.
sorted_pairs = sorted(zip(importances_final, feature_names_final), reverse=True)
importances_sorted, feature_names_sorted = zip(*sorted_pairs)
plt.figure(figsize=(12, 6))
bars = plt.barh(feature_names_sorted, importances_sorted)
plt.xlabel("Increase in MSE when shuffled")
plt.title("Model 1: Arrivals (2019)")
plt.gca().invert_yaxis()
# Annotate each bar with its importance value.
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height() / 2,
             f"{width:.4f}", va='center')
plt.tight_layout()
plt.show()
18/18 ━━━━━━━━━━━━━━━━━━━━ 2s 58ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 23ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
## Model evaluation
# Model 1 evaluation: flatten predictions, report error metrics on both
# splits, then plot the training/validation loss curves.
train_preds = model1.predict([X_num_train, X_cat_train]).flatten()
test_preds = model1.predict([X_num_test, X_cat_test]).flatten()
# Squared-error metrics.
train_mse = mean_squared_error(y_train, train_preds)
test_mse = mean_squared_error(y_test, test_preds)
train_rmse1, test_rmse1 = np.sqrt(train_mse), np.sqrt(test_mse)
# Absolute-error metrics.
train_mae1 = mean_absolute_error(y_train, train_preds)
test_mae1 = mean_absolute_error(y_test, test_preds)
train_mape1 = mean_absolute_percentage_error(y_train, train_preds)
test_mape1 = mean_absolute_percentage_error(y_test, test_preds)
# Goodness of fit.
train_r2_1 = r2_score(y_train, train_preds)
test_r2_1 = r2_score(y_test, test_preds)
# Summary lines for both splits.
print(f"Train MSE: {train_mse:.4f}, RMSE: {train_rmse1:.4f}, MAE: {train_mae1:.4f}, MAPE: {train_mape1:.4f}, R²: {train_r2_1:.4f}")
print(f"Test MSE: {test_mse:.4f}, RMSE: {test_rmse1:.4f}, MAE: {test_mae1:.4f}, MAPE: {test_mape1:.4f}, R²: {test_r2_1:.4f}")
# Loss curves over epochs.
plt.figure(figsize=(10, 4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training History')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()
121/121 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step Train MSE: 0.0857, RMSE: 0.2927, MAE: 0.2219, MAPE: 0.0277, R²: 0.9779 Test MSE: 0.4103, RMSE: 0.6405, MAE: 0.4875, MAPE: 0.0546, R²: 0.8716
# Global trend plot (total actual vs predicted per month).
y_pred_all = model1.predict([X_numeric_scaled_all, X_country_array]).flatten()
# Invert the log1p target transform so the plot shows raw counts.
y_actual_all = np.expm1(y)
y_pred_all = np.expm1(y_pred_all)
# NOTE(review): assumes y / predictions are row-aligned with `cool` — verify
# against how X_numeric_scaled_all was built for model1.
df_plot = cool.copy()
df_plot['actual'] = y_actual_all
df_plot['predicted'] = y_pred_all
# Sum across countries to a single total per month.
monthly_totals_all = df_plot.groupby('date')[['actual', 'predicted']].sum()
plt.figure(figsize=(16, 6))
plt.plot(monthly_totals_all.index, monthly_totals_all['actual'], label='Total Actual', linewidth=2)
plt.plot(monthly_totals_all.index, monthly_totals_all['predicted'], label='Total Predicted', linestyle='--', alpha=0.8)
plt.title("Total Monthly Arrivals (2019)")
plt.xlabel("Date")
# Fix: this figure plots arrivals, not overnights.
plt.ylabel("Total Arrivals")
plt.ticklabel_format(style='plain', axis='y')
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.xlim(monthly_totals_all.index.min(), monthly_totals_all.index.max())
plt.axvline(x=train_end, color='gray', linestyle='--', linewidth=1.5, label='Train/Val Split')
plt.axvline(x=val_end, color='black', linestyle='--', linewidth=1.5, label='Val/Test Split')
plt.text(train_end, plt.ylim()[1]*0.95, 'Training set', rotation=90, ha='right', va='top', color='gray')
# Fix: anchor 'Testing set' to the right of the val/test split line; with
# ha='right' it sat on the validation side.
plt.text(val_end, plt.ylim()[1]*0.95, 'Testing set', rotation=90, ha='left', va='top', color='black')
# Fix: legend() must come after the axvlines or their labels are dropped.
plt.legend()
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
plt.tight_layout()
plt.show()
164/164 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step
# Country-level plots: one actual-vs-predicted panel per country, 3 per row.
countries = df_plot['country'].unique()
n_countries = len(countries)
# Exact number of 3-wide rows (ceiling division); the old `n//3 + 1` left an
# entire blank row whenever n was a multiple of 3.
n_rows = max(1, -(-n_countries // 3))
fig, axes = plt.subplots(n_rows, 3, figsize=(18, 3 * n_rows), sharex=False, sharey=False)
axes = axes.flatten()
for i, country in enumerate(countries):
    country_df = df_plot[df_plot['country'] == country].groupby('date')[['actual', 'predicted']].sum()
    ax = axes[i]
    ax.plot(country_df.index, country_df['actual'], label='Actual')
    ax.plot(country_df.index, country_df['predicted'], label='Predicted', linestyle='--')
    ax.set_title(country)
    # Show x-axis ticks every 3 years.
    ax.xaxis.set_major_locator(mdates.YearLocator(base=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    # Add legend only once.
    if i == 0:
        ax.legend()
# Remove unused subplots. Using n_countries (not the loop variable) also
# avoids a NameError when `countries` happens to be empty.
for j in range(n_countries, len(axes)):
    fig.delaxes(axes[j])
fig.tight_layout()
fig.suptitle('Actual vs Predicted Monthly Arrivals per Country (2019)', fontsize=16, y=1.02)
plt.show()
# df_plot: DataFrame with columns ['country', 'date', 'actual', 'predicted']
# Per-country error metrics on the held-out period (dates after val_end).
countries = df_plot['country'].unique()
results = []
for country in countries:
    # Restrict to this country's rows in the test window.
    in_test_window = (df_plot['country'] == country) & (df_plot['date'] > val_end)
    subset = df_plot[in_test_window]
    actual = subset['actual'].values
    pred = subset['predicted'].values
    # Skip countries with no observations after the validation cutoff.
    if actual.size == 0:
        continue
    mse = round(mean_squared_error(actual, pred), 6)
    results.append({
        'country': country,
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(actual, pred),
        'MAPE': mean_absolute_percentage_error(actual, pred),
        'R2': r2_score(actual, pred),
    })
# Tabulate for display or export.
country_metrics = pd.DataFrame(results)
print(country_metrics)
country MSE RMSE MAE \
0 Austria 2.679614e+09 51764.985675 37030.900614
1 Belgium 2.492627e+07 4992.621074 3350.236781
2 Bosnia and Herzegovina 1.842222e+08 13572.847560 10908.470133
3 Canada 7.779840e+07 8820.339876 5862.227914
4 Czech Republic 5.728626e+08 23934.548834 13299.749715
5 Denmark 9.971241e+07 9985.609978 4399.104020
6 Finland 4.199998e+07 6480.738915 5031.262491
7 France 1.101853e+08 10496.917985 7294.615351
8 Germany 4.560198e+10 213546.207054 127152.686279
9 Hungary 6.498556e+08 25492.265914 13569.599833
10 Ireland 3.477323e+07 5896.883205 3741.162018
11 Italy 7.621585e+09 87301.691953 31303.574341
12 Netherlands 1.198660e+08 10948.333395 7456.408488
13 Norway 3.310414e+07 5753.619902 3318.054125
14 Poland 2.317188e+09 48137.174735 28348.363419
15 Romania 2.196468e+07 4686.648987 2272.309680
16 Slovakia 2.515013e+07 5014.990083 2605.888275
17 Slovenia 3.576730e+09 59805.772720 41043.037821
18 Spain 2.336578e+07 4833.815971 3305.533460
19 Sweden 4.620571e+07 6797.478266 4890.971967
20 Switzerland 7.623722e+07 8731.392972 6250.464722
21 USA 8.693620e+08 29484.944652 23078.987376
22 United Kingdom 1.602999e+09 40037.469582 28814.113729
MAPE R2
0 0.317875 0.766548
1 0.327174 0.943330
2 0.368343 0.841933
3 0.354829 0.593108
4 0.285757 0.931426
5 0.454653 0.783175
6 0.584503 0.383765
7 0.182306 0.967303
8 0.360820 0.267050
9 0.315947 0.874721
10 0.399846 0.507424
11 0.202527 0.592406
12 0.327610 0.955578
13 0.363048 0.925196
14 0.379036 0.787249
15 0.254269 0.851325
16 0.244951 0.993074
17 0.405495 0.717762
18 0.251376 0.974041
19 0.429396 0.946978
20 0.289273 0.852513
21 0.473352 0.466255
22 0.360491 0.654020
# Reload the raw data for the second pipeline (non-Schengen subset model).
cool = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/df_top_cool.csv")
cool.replace('..', np.nan, inplace=True)  # '..' is the source's missing-value marker
cool['date'] = pd.to_datetime(cool[['year', 'month']].assign(day=1))
cool = cool[~cool['country'].isin(['Russian Federation'])]
# Type conversions
cool['year'] = pd.to_numeric(cool['year'], downcast='integer', errors='coerce')
cool['month'] = pd.to_numeric(cool['month'], downcast='integer', errors='coerce')
cool['arrivals'] = pd.to_numeric(cool['arrivals'], errors='coerce')
cool['overnights'] = pd.to_numeric(cool['overnights'], errors='coerce')
cool['unemployment_rate'] = pd.to_numeric(cool['unemployment_rate'], errors='coerce')
cool['exchange_rate'] = pd.to_numeric(cool['exchange_rate'], errors='coerce')
cool['cpi'] = pd.to_numeric(cool['cpi'], errors='coerce')
cool['eu_member'] = cool['eu_member'].astype(int)
# Fix: the first load (top of file) also cast euro_adopted to int, and it is
# used as a model feature below — keep the conversion consistent here.
cool['euro_adopted'] = cool['euro_adopted'].astype(int)
# Schengen membership flag: 1 from the country's entry year onward.
schengen_entry_year = {
    'Austria': 1995, 'Belgium': 1995, 'Czech Republic': 2007, 'Denmark': 2001,
    'Finland': 1996, 'France': 1995, 'Germany': 1995, 'Hungary': 2007,
    'Italy': 1997, 'Netherlands': 1995, 'Norway': 2001, 'Poland': 2007,
    'Portugal': 1995, 'Slovakia': 2007, 'Slovenia': 2007, 'Spain': 1995,
    'Sweden': 2001, 'Switzerland': 2008, 'Romania': 2024,
}
# Countries absent from the dict get np.inf, i.e. never members (flag stays 0).
cool['schengen_member'] = cool.apply(lambda row: int(row['year'] >= schengen_entry_year.get(row['country'], np.inf)), axis=1).astype('int8')
# Add Google Trends: the CSV is wide (one column per country), so melt it to
# long form and left-join on (date, country) at monthly granularity.
td = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/multiTimeline.csv")
td['date'] = pd.to_datetime(td['date']).dt.to_period('M').dt.to_timestamp()
td_long = td.melt(id_vars='date', var_name='country', value_name='google_trends')
# Normalize both sides to first-of-month timestamps so the merge keys align.
cool['date'] = pd.to_datetime(cool['date']).dt.to_period('M').dt.to_timestamp()
cool = cool.merge(td_long, on=['date', 'country'], how='left')
# -1 is used as a "no trends data" sentinel rather than leaving NaN.
cool['google_trends'] = cool['google_trends'].fillna(-1)
# Log transform + lags. Sorting by (country, date) FIRST is essential: the
# groupby().shift() calls below assume chronological order within country.
cool = cool.sort_values(['country', 'date']).drop_duplicates(subset=['country', 'date']).reset_index(drop=True)
cool['arrivals'] = np.log1p(cool['arrivals'])
cool['overnights'] = np.log1p(cool['overnights'])
cool['exchange_rate'] = np.log1p(cool['exchange_rate'])
cool['cpi'] = np.log1p(cool['cpi'])
# Target: next month's (log) arrivals, shifted within each country.
cool['arrivals_next_month'] = cool.groupby('country')['arrivals'].shift(-1)
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
# One-hot encode calendar month (month_January ... month_December).
cool['month_name'] = cool['month'].map(month_names)
ohe_month = pd.get_dummies(cool['month_name'], prefix='month').astype(int)
cool = pd.concat([cool.drop(columns=['month_name']), ohe_month], axis=1)
# Per-country lag features (all on the log-transformed series).
cool['arrivals_lag_1'] = cool.groupby('country')['arrivals'].shift(1)
cool['arrivals_lag_3'] = cool.groupby('country')['arrivals'].shift(3)
cool['arrivals_lag_6'] = cool.groupby('country')['arrivals'].shift(6)
cool['arrivals_lag_12'] = cool.groupby('country')['arrivals'].shift(12)
cool['cpi_lag_1'] = cool.groupby('country')['cpi'].shift(1)
cool['unemp_rate_lag_3'] = cool.groupby('country')['unemployment_rate'].shift(3)
cool['google_trends_lag_1'] = cool.groupby('country')['google_trends'].shift(1)
cool['google_trends_lag_3'] = cool.groupby('country')['google_trends'].shift(3)
# Trim the lag-induced NaN head and keep only the non-Schengen subset.
cool = cool[cool['date'] >= '2001-01-01']
cool_ns = cool[cool['country'].isin(['Ireland','Bosnia and Herzegovina', 'USA', 'United Kingdom', 'Canada'])].copy()
# Drop the final month per country, whose target (next month) is NaN.
cool_ns = cool_ns[cool_ns['arrivals_next_month'].notna()].copy()
# Integer country codes for the embedding input.
cool_ns['country_encoded'] = LabelEncoder().fit_transform(cool_ns['country'])
# Assemble the model matrices for the non-Schengen subset and split by date.
month_cols = [c for c in cool_ns.columns if c.startswith('month_')]
numeric_feature_cols = [
    'unemp_rate_lag_3', 'exchange_rate', 'cpi_lag_1',
    'arrivals_lag_1', 'arrivals_lag_3', 'arrivals_lag_6', 'arrivals_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted'
]
X_numeric = cool_ns[numeric_feature_cols + month_cols].values
X_country_array = cool_ns['country_encoded'].astype('int32').values
y = cool_ns['arrivals_next_month'].values
# Chronological split boundaries: train ≤ 2016, val 2017–2020, test > 2020.
train_end = pd.Timestamp("2016-12-31")
val_end = pd.Timestamp("2020-12-31")
train_mask = cool_ns['date'] <= train_end
val_mask = (cool_ns['date'] > train_end) & (cool_ns['date'] <= val_end)
test_mask = cool_ns['date'] > val_end
# Fit the scaler on the training window only, then apply the same transform
# everywhere so validation/test statistics never leak into scaling.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_numeric[train_mask])
X_num_val = scaler.transform(X_numeric[val_mask])
X_num_test = scaler.transform(X_numeric[test_mask])
X_numeric_scaled_all = scaler.transform(X_numeric)
# Country codes and targets sliced with the same boolean masks.
X_cat_train = X_country_array[train_mask.to_numpy()]
X_cat_val = X_country_array[val_mask.to_numpy()]
X_cat_test = X_country_array[test_mask.to_numpy()]
y_train = y[train_mask.to_numpy()]
y_val = y[val_mask.to_numpy()]
y_test = y[test_mask.to_numpy()]
# Model 0: learned country embedding concatenated with the scaled numeric
# features, followed by two dense blocks with batch norm + dropout.
input_numeric = Input(shape=(X_num_train.shape[1],), name="numeric_input")
input_country = Input(shape=(1,), dtype='int32', name="country_input")
# 5-dimensional embedding per distinct country code.
embedding = Embedding(input_dim=len(np.unique(X_country_array)), output_dim=5)(input_country)
embedding_flat = Flatten()(embedding)
x = Concatenate()([input_numeric, embedding_flat])
x = Dense(64, activation='tanh')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='tanh')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
output = Dense(1)(x)  # linear output: predicts log1p(arrivals) next month
model0 = Model(inputs=[input_numeric, input_country], outputs=output)
model0.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
early_stop = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
history = model0.fit(
    [X_num_train, X_cat_train], y_train,
    validation_data=([X_num_val, X_cat_val], y_val),
    epochs=100, batch_size=16,
    # Fix: early_stop was constructed but never passed to fit(), so training
    # always ran the full 100 epochs and kept the final (not best) weights.
    callbacks=[early_stop])
model0.summary()
Epoch 1/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 19s 21ms/step - loss: 70.2320 - val_loss: 67.7599 Epoch 2/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 62.0043 - val_loss: 57.4380 Epoch 3/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 3s 28ms/step - loss: 52.5921 - val_loss: 44.6008 Epoch 4/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 2s 20ms/step - loss: 41.1542 - val_loss: 29.7536 Epoch 5/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 2s 37ms/step - loss: 27.9099 - val_loss: 17.7265 Epoch 6/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 4s 38ms/step - loss: 16.7578 - val_loss: 10.7735 Epoch 7/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 2s 18ms/step - loss: 8.8176 - val_loss: 4.5991 Epoch 8/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step - loss: 4.9372 - val_loss: 2.4491 Epoch 9/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - loss: 3.5004 - val_loss: 2.2870 Epoch 10/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 2s 21ms/step - loss: 3.5081 - val_loss: 2.2256 Epoch 11/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 8s 103ms/step - loss: 2.7785 - val_loss: 2.1660 Epoch 12/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 3s 23ms/step - loss: 2.7310 - val_loss: 2.1169 Epoch 13/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - loss: 2.6056 - val_loss: 2.0984 Epoch 14/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 2s 17ms/step - loss: 3.0444 - val_loss: 2.1610 Epoch 15/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 20ms/step - loss: 2.6913 - val_loss: 2.1628 Epoch 16/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - loss: 2.2879 - val_loss: 1.9532 Epoch 17/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - loss: 2.5391 - val_loss: 2.0436 Epoch 18/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 1.9582 - val_loss: 2.0355 Epoch 19/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.9699 - val_loss: 2.0712 Epoch 20/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 1.8013 - val_loss: 2.0091 Epoch 21/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 3s 43ms/step - loss: 1.8588 - val_loss: 1.9895 Epoch 22/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 4s 24ms/step - loss: 2.0070 - val_loss: 1.8358 Epoch 23/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step - loss: 1.8016 - val_loss: 
1.9515 Epoch 24/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - loss: 1.9085 - val_loss: 2.0363 Epoch 25/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.8029 - val_loss: 2.0660 Epoch 26/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step - loss: 1.5171 - val_loss: 2.0622 Epoch 27/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - loss: 1.6722 - val_loss: 2.0327 Epoch 28/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 2s 11ms/step - loss: 1.5666 - val_loss: 1.9213 Epoch 29/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.5632 - val_loss: 2.0125 Epoch 30/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 1.6246 - val_loss: 1.8871 Epoch 31/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.4207 - val_loss: 1.9351 Epoch 32/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.3482 - val_loss: 1.9312 Epoch 33/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.4414 - val_loss: 1.8715 Epoch 34/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.1816 - val_loss: 1.9154 Epoch 35/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step - loss: 1.3311 - val_loss: 1.9495 Epoch 36/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.3297 - val_loss: 1.8732 Epoch 37/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.3147 - val_loss: 1.8953 Epoch 38/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.3075 - val_loss: 1.9057 Epoch 39/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2924 - val_loss: 1.8423 Epoch 40/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2604 - val_loss: 1.8251 Epoch 41/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.2987 - val_loss: 1.8850 Epoch 42/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2480 - val_loss: 1.8663 Epoch 43/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.2543 - val_loss: 1.8764 Epoch 44/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.1732 - val_loss: 1.8326 Epoch 45/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.0759 - val_loss: 1.8918 Epoch 46/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.0416 - val_loss: 1.8623 Epoch 
47/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.0965 - val_loss: 1.9172 Epoch 48/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.0024 - val_loss: 1.9087 Epoch 49/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.0457 - val_loss: 1.9199 Epoch 50/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.8859 - val_loss: 1.8941 Epoch 51/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.9685 - val_loss: 1.8796 Epoch 52/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1287 - val_loss: 1.8473 Epoch 53/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - loss: 1.0485 - val_loss: 1.8785 Epoch 54/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.0335 - val_loss: 1.8847 Epoch 55/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.9415 - val_loss: 1.8722 Epoch 56/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.9807 - val_loss: 1.8616 Epoch 57/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.9180 - val_loss: 1.9059 Epoch 58/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 1.0627 - val_loss: 1.8404 Epoch 59/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.8911 - val_loss: 1.9043 Epoch 60/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.9765 - val_loss: 1.8932 Epoch 61/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.9974 - val_loss: 1.8229 Epoch 62/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.0363 - val_loss: 1.8347 Epoch 63/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.8997 - val_loss: 1.8874 Epoch 64/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.8870 - val_loss: 1.8901 Epoch 65/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.8145 - val_loss: 1.8895 Epoch 66/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.8541 - val_loss: 1.9292 Epoch 67/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.8720 - val_loss: 1.8794 Epoch 68/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 0.9244 - val_loss: 1.9101 Epoch 69/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.8384 - val_loss: 1.9313 Epoch 70/100 60/60 
━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - loss: 0.8639 - val_loss: 1.9346 Epoch 71/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.9237 - val_loss: 1.9323 Epoch 72/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - loss: 0.8753 - val_loss: 1.9283 Epoch 73/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - loss: 0.8214 - val_loss: 1.9219 Epoch 74/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.8923 - val_loss: 1.9176 Epoch 75/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.8144 - val_loss: 1.8879 Epoch 76/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.8084 - val_loss: 1.8796 Epoch 77/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.7641 - val_loss: 1.8786 Epoch 78/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.8447 - val_loss: 1.8973 Epoch 79/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.7786 - val_loss: 1.9153 Epoch 80/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 0.8689 - val_loss: 1.8983 Epoch 81/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.8436 - val_loss: 1.9101 Epoch 82/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.8528 - val_loss: 1.9517 Epoch 83/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.7476 - val_loss: 1.9288 Epoch 84/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.7558 - val_loss: 1.8960 Epoch 85/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.7617 - val_loss: 1.9375 Epoch 86/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.8155 - val_loss: 1.9272 Epoch 87/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.8460 - val_loss: 1.9310 Epoch 88/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.7485 - val_loss: 1.9583 Epoch 89/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.7601 - val_loss: 1.9537 Epoch 90/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.8471 - val_loss: 1.8919 Epoch 91/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.6934 - val_loss: 1.9042 Epoch 92/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.7554 - val_loss: 1.9332 Epoch 93/100 60/60 
━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.7792 - val_loss: 1.9139 Epoch 94/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.7224 - val_loss: 1.9408 Epoch 95/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.7511 - val_loss: 1.9806 Epoch 96/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.6967 - val_loss: 1.9740 Epoch 97/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.7939 - val_loss: 1.9248 Epoch 98/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 0.7514 - val_loss: 1.9794 Epoch 99/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.8539 - val_loss: 1.9747 Epoch 100/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 0.7982 - val_loss: 2.0086
Model: "functional_38"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │ country_input │ (None, 1) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ embedding_38 │ (None, 1, 5) │ 25 │ country_input[0]… │ │ (Embedding) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ numeric_input │ (None, 25) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ flatten_38 │ (None, 5) │ 0 │ embedding_38[0][… │ │ (Flatten) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ concatenate_38 │ (None, 30) │ 0 │ numeric_input[0]… │ │ (Concatenate) │ │ │ flatten_38[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_114 (Dense) │ (None, 64) │ 1,984 │ concatenate_38[0… │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 64) │ 256 │ dense_114[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_76 │ (None, 64) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_115 (Dense) │ (None, 32) │ 2,080 │ dropout_76[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 32) │ 128 │ dense_115[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_77 │ (None, 32) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_116 (Dense) │ (None, 1) │ 33 │ dropout_77[0][0] │ 
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
Total params: 13,136 (51.32 KB)
Trainable params: 4,314 (16.85 KB)
Non-trainable params: 192 (768.00 B)
Optimizer params: 8,630 (33.71 KB)
## Model evaluation
# Model 0 evaluation: flatten predictions, report error metrics on both
# splits, then plot the training/validation loss curves.
train_preds = model0.predict([X_num_train, X_cat_train]).flatten()
test_preds = model0.predict([X_num_test, X_cat_test]).flatten()
# Squared-error metrics.
train_mse = mean_squared_error(y_train, train_preds)
test_mse = mean_squared_error(y_test, test_preds)
train_rmse0, test_rmse0 = np.sqrt(train_mse), np.sqrt(test_mse)
# Absolute-error metrics.
train_mae0 = mean_absolute_error(y_train, train_preds)
test_mae0 = mean_absolute_error(y_test, test_preds)
train_mape0 = mean_absolute_percentage_error(y_train, train_preds)
test_mape0 = mean_absolute_percentage_error(y_test, test_preds)
# Goodness of fit.
train_r2_0 = r2_score(y_train, train_preds)
test_r2_0 = r2_score(y_test, test_preds)
# Summary lines for both splits.
print(f"Train MSE: {train_mse:.4f}, RMSE: {train_rmse0:.4f}, MAE: {train_mae0:.4f}, MAPE: {train_mape0:.4f}, R²: {train_r2_0:.4f}")
print(f"Test MSE: {test_mse:.4f}, RMSE: {test_rmse0:.4f}, MAE: {test_mae0:.4f}, MAPE: {test_mape0:.4f}, R²: {test_r2_0:.4f}")
# Loss curves over epochs.
plt.figure(figsize=(10, 4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training History')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()
30/30 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step Train MSE: 0.0663, RMSE: 0.2575, MAE: 0.1989, MAPE: 0.0252, R²: 0.9742 Test MSE: 0.5710, RMSE: 0.7556, MAE: 0.6125, MAPE: 0.0716, R²: 0.8293
# Global trend plot (total actual vs predicted per month)
y_pred_all = model0.predict([X_numeric_scaled_all, X_country_array]).flatten()
# Invert the log1p transform so totals are in raw arrival counts.
y_actual_all = np.expm1(y)
y_pred_all = np.expm1(y_pred_all)
df_plot = cool_ns.copy()
df_plot['actual'] = y_actual_all
df_plot['predicted'] = y_pred_all
monthly_totals_all = df_plot.groupby('date')[['actual', 'predicted']].sum()
plt.figure(figsize=(16, 6))
plt.plot(monthly_totals_all.index, monthly_totals_all['actual'], label='Total Actual', linewidth=2)
plt.plot(monthly_totals_all.index, monthly_totals_all['predicted'], label='Total Predicted', linestyle='--', alpha=0.8)
plt.title("Total Monthly Arrivals Subset Non-Schengen(2024)")
plt.xlabel("Date")
plt.ylabel("Total Arrivals")
plt.ticklabel_format(style='plain', axis='y')
plt.tight_layout()
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# Single xticks call; the earlier rotation=90 call was redundant and immediately overridden.
plt.xticks(rotation=45)
plt.xlim(monthly_totals_all.index.min(), monthly_totals_all.index.max())
plt.axvline(x=train_end, color='gray', linestyle='--', linewidth=1.5, label='Train/Val Split')
plt.axvline(x=val_end, color='black', linestyle='--', linewidth=1.5, label='Val/Test Split')
# BUG FIX: legend() must run after the labeled axvlines — previously it was
# called before them, so the split-line entries never appeared in the legend.
plt.legend()
plt.text(train_end, plt.ylim()[1]*0.95, 'Training set', rotation=90, ha='right', va='top', color='gray')
plt.text(val_end, plt.ylim()[1]*0.95, 'Testing set', rotation=90, ha='right', va='top', color='black')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
plt.show()
45/45 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
# One panel per country: actual vs predicted monthly totals, 3 columns per row.
countries = df_plot['country'].unique()
n_rows = len(countries) // 3 + 1
fig, axes = plt.subplots(n_rows, 3, figsize=(18, 3 * n_rows), sharex=False, sharey=False)
axes = axes.flatten()
for idx, name in enumerate(countries):
    panel = axes[idx]
    series = df_plot[df_plot['country'] == name].groupby('date')[['actual', 'predicted']].sum()
    panel.plot(series.index, series['actual'], label='Actual')
    panel.plot(series.index, series['predicted'], label='Predicted', linestyle='--')
    panel.set_title(name)
    # Tick every third year to keep the shared x-axes readable.
    panel.xaxis.set_major_locator(mdates.YearLocator(base=3))
    panel.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    panel.tick_params(axis='x', rotation=45)
    panel.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    # Legend on the first panel only.
    if idx == 0:
        panel.legend()
# Drop the surplus axes in the last (partial) row.
for spare in axes[idx + 1:]:
    fig.delaxes(spare)
fig.tight_layout()
fig.suptitle('Actual vs Predicted Monthly Arrivals for Non Schengen Countries (2024)', fontsize=16, y=1.02)
plt.show()
# df_plot: DataFrame with columns ['country', 'date', 'actual', 'predicted']
# Per-country error metrics, restricted to the test period (dates after val_end).
countries = df_plot['country'].unique()
results = []
for country in countries:
    # Subset data for this country (test set only)
    country_df = df_plot[(df_plot['country'] == country) & (df_plot['date'] > val_end)]
    actual = country_df['actual'].values
    pred = country_df['predicted'].values
    if len(actual) == 0:
        continue  # Skip countries with no data in test set
    mse = mean_squared_error(actual, pred)
    # BUG FIX: take the square root of the exact MSE. Previously RMSE was
    # computed from the value already rounded to 6 decimals, which distorts it.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, pred)
    mape = mean_absolute_percentage_error(actual, pred)
    r2 = r2_score(actual, pred)
    results.append({
        'country': country,
        'MSE': round(mse, 6),  # rounding kept for display only
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'R2': r2
    })
# Convert to DataFrame for easy export or display
country_metrics = pd.DataFrame(results)
print(country_metrics)
country MSE RMSE MAE MAPE \
0 Bosnia and Herzegovina 4.089451e+08 20222.391154 13250.893027 0.470344
1 Canada 1.519023e+07 3897.463996 2464.991178 0.581352
2 Ireland 2.961782e+07 5442.225958 3381.523684 0.592970
3 USA 1.114507e+09 33384.228835 23627.993380 0.564387
4 United Kingdom 2.290262e+09 47856.685346 31533.607595 0.891014
R2
0 0.637028
1 0.866758
2 0.588191
3 0.421547
4 0.268385
# Baseline predictions: test-set MSE before any feature is shuffled.
baseline_preds = model0.predict([X_num_test, X_cat_test]).flatten()
baseline_mse = mean_squared_error(y_test, baseline_preds)
# NOTE(review): the first label is 'unemployment_rate', but the parallel
# feature-matrix cell builds its first column from 'unemp_rate_lag_3' —
# confirm which one this run's X_num_test actually contains.
feature_names = [
    'unemployment_rate', 'exchange_rate', 'cpi_lag_1',
    'arrivals_lag_1', 'arrivals_lag_3', 'arrivals_lag_6', 'arrivals_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted']+month_cols
# Permutation importance: shuffle one column at a time and record the MSE increase.
importances = []
for i in range(X_num_test.shape[1]):
    X_permuted = X_num_test.copy()
    X_permuted[:, i] = np.random.permutation(X_permuted[:, i])
    permuted_preds = model0.predict([X_permuted, X_cat_test]).flatten()
    permuted_mse = mean_squared_error(y_test, permuted_preds)
    # Negative deltas (shuffling helped by chance) are clipped to zero here.
    importances.append(max(0, permuted_mse - baseline_mse))
# Handle grouped month dummies so it's not one bar for each month
month_indices = [i for i, f in enumerate(feature_names) if f.startswith('month_')]
if month_indices:
    # Safe grouped permutation: move whole rows of the month-dummy columns
    # together so every permuted row keeps a valid one-hot month pattern.
    X_month_permuted = X_num_test.copy()
    row_perm = np.random.permutation(len(X_month_permuted))
    X_month_permuted[:, month_indices] = X_month_permuted[row_perm][:, month_indices]
    month_preds = model0.predict([X_month_permuted, X_cat_test]).flatten()
    month_importance = mean_squared_error(y_test, month_preds) - baseline_mse
else:
    month_importance = 0  # fallback if no month cols present
# Collapse the twelve month bars into one grouped 'month_group' bar.
filtered_names = [f for i, f in enumerate(feature_names) if i not in month_indices]
filtered_importances = [imp for i, imp in enumerate(importances) if i not in month_indices]
feature_names_final = filtered_names + ['month_group']
importances_final = filtered_importances + [month_importance]
sorted_pairs = sorted(zip(importances_final, feature_names_final), reverse=True)
importances_sorted, feature_names_sorted = zip(*sorted_pairs)
plt.figure(figsize=(12, 6))
bars = plt.barh(feature_names_sorted, importances_sorted)
plt.xlabel("Increase in MSE when shuffled")
plt.title("Arrivals (Non-Schengen countries 2024)")
plt.gca().invert_yaxis()
# Add a numeric label next to each bar
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height() / 2,
             f"{width:.4f}", va='center')
plt.tight_layout()
plt.show()
8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 20ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step
# Load data (fresh copy of the raw panel for this model variant)
cool = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/df_top_cool.csv")
# '..' is the data source's missing-value marker; normalise to NaN before casts.
cool.replace('..', np.nan, inplace=True)
cool['date'] = pd.to_datetime(cool[['year', 'month']].assign(day=1))
cool = cool[~cool['country'].isin(['Russian Federation'])]
# Type conversions
cool['year'] = pd.to_numeric(cool['year'], downcast='integer', errors='coerce')
cool['month'] = pd.to_numeric(cool['month'], downcast='integer', errors='coerce')
cool['arrivals'] = pd.to_numeric(cool['arrivals'], errors='coerce')
cool['overnights'] = pd.to_numeric(cool['overnights'], errors='coerce')
cool['unemployment_rate'] = pd.to_numeric(cool['unemployment_rate'], errors='coerce')
cool['exchange_rate'] = pd.to_numeric(cool['exchange_rate'], errors='coerce')
cool['cpi'] = pd.to_numeric(cool['cpi'], errors='coerce')
cool['eu_member'] = cool['eu_member'].astype(int)
# NOTE(review): unlike the first load cell, 'euro_adopted' is not cast to int
# here even though it is used as a feature below — confirm the raw column is numeric.
# Add Schengen and Euro info
schengen_entry_year = {
    'Austria': 1995, 'Belgium': 1995, 'Czech Republic': 2007, 'Denmark': 2001,
    'Finland': 1996, 'France': 1995, 'Germany': 1995, 'Hungary': 2007,
    'Italy': 1997, 'Netherlands': 1995, 'Norway': 2001, 'Poland': 2007,
    'Portugal': 1995, 'Slovakia': 2007, 'Slovenia': 2007, 'Spain': 1995,
    'Sweden': 2001, 'Switzerland': 2008, 'Romania': 2024,
}
# 1 if the country had joined Schengen by the row's year; countries missing
# from the dict get np.inf, i.e. are treated as never joining.
cool['schengen_member'] = cool.apply(lambda row: int(row['year'] >= schengen_entry_year.get(row['country'], np.inf)), axis=1).astype('int8')
# Add Google Trends
td = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/multiTimeline.csv")
# Snap both sides to the first day of the month so the merge keys align exactly.
td['date'] = pd.to_datetime(td['date']).dt.to_period('M').dt.to_timestamp()
td_long = td.melt(id_vars='date', var_name='country', value_name='google_trends')
cool['date'] = pd.to_datetime(cool['date']).dt.to_period('M').dt.to_timestamp()
cool = cool.merge(td_long, on=['date', 'country'], how='left')
# Sentinel -1 marks (country, month) cells with no trends coverage.
cool['google_trends'] = cool['google_trends'].fillna(-1)
# Log transform + lags (sort/dedupe first so groupby-shift lags are well defined)
cool = cool.sort_values(['country', 'date']).drop_duplicates(subset=['country', 'date']).reset_index(drop=True)
cool['arrivals'] = np.log1p(cool['arrivals'])
cool['overnights'] = np.log1p(cool['overnights'])
cool['exchange_rate'] = np.log1p(cool['exchange_rate'])
cool['cpi'] = np.log1p(cool['cpi'])
# Target: next month's (log1p) arrivals, shifted within each country's series.
cool['arrivals_next_month'] = cool.groupby('country')['arrivals'].shift(-1)
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
# One-hot encode the calendar month as month_January ... month_September.
cool['month_name'] = cool['month'].map(month_names)
ohe_month = pd.get_dummies(cool['month_name'], prefix='month').astype(int)
cool = pd.concat([cool.drop(columns=['month_name']), ohe_month], axis=1)
# Per-country lag features; groupby keeps each lag inside its own country series.
cool['arrivals_lag_1'] = cool.groupby('country')['arrivals'].shift(1)
cool['arrivals_lag_3'] = cool.groupby('country')['arrivals'].shift(3)
cool['arrivals_lag_6'] = cool.groupby('country')['arrivals'].shift(6)
cool['arrivals_lag_12'] = cool.groupby('country')['arrivals'].shift(12)
cool['cpi_lag_1'] = cool.groupby('country')['cpi'].shift(1)
cool['unemp_rate_lag_3'] = cool.groupby('country')['unemployment_rate'].shift(3)
cool['google_trends_lag_1'] = cool.groupby('country')['google_trends'].shift(1)
cool['google_trends_lag_3'] = cool.groupby('country')['google_trends'].shift(3)
cool = cool[cool['date'] >= '2001-01-01']
# Restrict to the five-country subset used by this model.
cool_sub = cool[cool['country'].isin(['Germany', 'Italy', 'Poland', 'Slovenia', 'Switzerland'])].copy()
# Drop each country's final month, whose target is NaN after the shift(-1).
cool_sub = cool_sub[cool_sub['arrivals_next_month'].notna()].copy()
cool_sub['country_encoded'] = LabelEncoder().fit_transform(cool_sub['country'])
month_cols = [col for col in cool_sub.columns if col.startswith('month_')]
# Numeric feature matrix: lagged macro/trends features + membership flags + month dummies.
X_numeric = cool_sub[[
    'unemp_rate_lag_3', 'exchange_rate', 'cpi_lag_1',
    'arrivals_lag_1', 'arrivals_lag_3', 'arrivals_lag_6', 'arrivals_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted'
] + month_cols].values
X_country_array = cool_sub['country_encoded'].astype('int32').values
y = cool_sub['arrivals_next_month'].values
# Chronological split: train through 2016, validation 2017-2020, test after 2020.
train_end = pd.Timestamp("2016-12-31")
val_end = pd.Timestamp("2020-12-31")
dates = cool_sub['date']
train_mask = dates <= train_end
val_mask = (dates > train_end) & (dates <= val_end)
test_mask = dates > val_end
# Materialise the boolean index arrays once instead of per use.
tr = train_mask.to_numpy()
va = val_mask.to_numpy()
te = test_mask.to_numpy()
# Fit the scaler on the training slice only, then apply it everywhere
# (including the full matrix used later for whole-series prediction).
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_numeric[tr])
X_num_val = scaler.transform(X_numeric[va])
X_num_test = scaler.transform(X_numeric[te])
X_numeric_scaled_all = scaler.transform(X_numeric)
X_cat_train, X_cat_val, X_cat_test = X_country_array[tr], X_country_array[va], X_country_array[te]
y_train, y_val, y_test = y[tr], y[va], y[te]
# Two-input regression network: scaled numeric features plus a learned
# 5-dimensional country embedding, concatenated into a small dense stack.
input_numeric = Input(shape=(X_num_train.shape[1],), name="numeric_input")
input_country = Input(shape=(1,), dtype='int32', name="country_input")
embedding = Embedding(input_dim=len(np.unique(X_country_array)), output_dim=5)(input_country)
embedding_flat = Flatten()(embedding)
x = Concatenate()([input_numeric, embedding_flat])
x = Dense(64, activation='tanh')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='tanh')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
output = Dense(1)(x)  # linear output: regression on next-month (log1p) arrivals
model0 = Model(inputs=[input_numeric, input_country], outputs=output)
model0.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
early_stop = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
history = model0.fit(
    [X_num_train, X_cat_train], y_train,
    validation_data=([X_num_val, X_cat_val], y_val),
    epochs=100, batch_size=16,
    # BUG FIX: early_stop was created but never passed to fit(), so training
    # always ran all 100 epochs and kept the final (not best) weights.
    callbacks=[early_stop])
model0.summary()
Epoch 1/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 9s 17ms/step - loss: 99.8589 - val_loss: 87.8250 Epoch 2/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 89.2531 - val_loss: 79.5005 Epoch 3/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 78.0134 - val_loss: 64.5254 Epoch 4/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 64.2394 - val_loss: 46.4489 Epoch 5/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 48.8235 - val_loss: 31.7959 Epoch 6/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 30.2262 - val_loss: 16.8680 Epoch 7/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 17.0672 - val_loss: 8.3088 Epoch 8/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 9.8194 - val_loss: 3.9473 Epoch 9/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 5.7126 - val_loss: 2.6425 Epoch 10/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 4.5960 - val_loss: 2.3715 Epoch 11/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 3.9489 - val_loss: 2.1418 Epoch 12/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 4.2142 - val_loss: 2.1984 Epoch 13/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 3.3642 - val_loss: 2.1295 Epoch 14/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 3.4298 - val_loss: 2.3898 Epoch 15/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - loss: 3.0317 - val_loss: 2.1423 Epoch 16/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 3.0056 - val_loss: 2.0869 Epoch 17/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 3.1760 - val_loss: 2.1471 Epoch 18/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 2.8644 - val_loss: 2.0593 Epoch 19/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.5819 - val_loss: 1.9651 Epoch 20/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 2.6302 - val_loss: 2.0754 Epoch 21/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 2.3172 - val_loss: 1.9571 Epoch 22/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.6045 - val_loss: 1.9875 Epoch 23/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.3572 - val_loss: 2.0511 Epoch 24/100 
60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.2594 - val_loss: 1.9282 Epoch 25/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 2.1104 - val_loss: 1.9812 Epoch 26/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.9974 - val_loss: 2.0113 Epoch 27/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.0764 - val_loss: 1.9094 Epoch 28/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.1980 - val_loss: 1.9146 Epoch 29/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.1560 - val_loss: 1.9428 Epoch 30/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.0843 - val_loss: 1.8962 Epoch 31/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 2.0152 - val_loss: 1.9258 Epoch 32/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.8557 - val_loss: 1.9844 Epoch 33/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.8506 - val_loss: 1.9047 Epoch 34/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.8798 - val_loss: 1.9965 Epoch 35/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 1.5983 - val_loss: 1.9403 Epoch 36/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.8010 - val_loss: 1.8881 Epoch 37/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.6613 - val_loss: 1.9787 Epoch 38/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.7772 - val_loss: 1.9763 Epoch 39/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.5918 - val_loss: 1.9536 Epoch 40/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.8109 - val_loss: 1.9297 Epoch 41/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.7184 - val_loss: 1.9111 Epoch 42/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.6047 - val_loss: 1.9123 Epoch 43/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.5846 - val_loss: 1.9302 Epoch 44/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.6097 - val_loss: 1.8821 Epoch 45/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.6324 - val_loss: 1.9088 Epoch 46/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - loss: 1.5535 - val_loss: 1.9347 Epoch 47/100 60/60 
━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.5074 - val_loss: 1.9502 Epoch 48/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.6296 - val_loss: 1.9467 Epoch 49/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.4934 - val_loss: 1.9260 Epoch 50/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.5551 - val_loss: 1.8559 Epoch 51/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.4930 - val_loss: 1.9036 Epoch 52/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.3361 - val_loss: 1.8649 Epoch 53/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.4073 - val_loss: 1.8501 Epoch 54/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.2863 - val_loss: 1.8864 Epoch 55/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.3986 - val_loss: 1.8456 Epoch 56/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - loss: 1.3960 - val_loss: 1.8984 Epoch 57/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - loss: 1.5584 - val_loss: 1.8663 Epoch 58/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2133 - val_loss: 1.8410 Epoch 59/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2921 - val_loss: 1.8973 Epoch 60/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2267 - val_loss: 1.8420 Epoch 61/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2583 - val_loss: 1.9192 Epoch 62/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1876 - val_loss: 1.8602 Epoch 63/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1679 - val_loss: 1.8681 Epoch 64/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step - loss: 1.2488 - val_loss: 1.8632 Epoch 65/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2227 - val_loss: 1.9012 Epoch 66/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.3660 - val_loss: 1.9421 Epoch 67/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2374 - val_loss: 1.8699 Epoch 68/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2288 - val_loss: 1.8792 Epoch 69/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2381 - val_loss: 1.9147 Epoch 70/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 
0s 7ms/step - loss: 1.2093 - val_loss: 1.9194 Epoch 71/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2100 - val_loss: 1.9425 Epoch 72/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.2448 - val_loss: 1.9157 Epoch 73/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2881 - val_loss: 1.8918 Epoch 74/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1497 - val_loss: 1.8765 Epoch 75/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.1248 - val_loss: 1.9502 Epoch 76/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2107 - val_loss: 1.9071 Epoch 77/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.3737 - val_loss: 1.8920 Epoch 78/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.0983 - val_loss: 1.8943 Epoch 79/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1360 - val_loss: 1.9241 Epoch 80/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.1182 - val_loss: 1.9082 Epoch 81/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.2238 - val_loss: 1.9105 Epoch 82/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.0812 - val_loss: 1.8724 Epoch 83/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.1690 - val_loss: 1.9216 Epoch 84/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step - loss: 1.1707 - val_loss: 1.9185 Epoch 85/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.3844 - val_loss: 1.8987 Epoch 86/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.0622 - val_loss: 1.8899 Epoch 87/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 1.1738 - val_loss: 1.8526 Epoch 88/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.1046 - val_loss: 1.8631 Epoch 89/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2165 - val_loss: 1.9145 Epoch 90/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.0262 - val_loss: 1.8874 Epoch 91/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.1927 - val_loss: 1.9025 Epoch 92/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1282 - val_loss: 1.9364 Epoch 93/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 
1.1627 - val_loss: 1.8884 Epoch 94/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1126 - val_loss: 1.9321 Epoch 95/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1513 - val_loss: 1.8814 Epoch 96/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.1160 - val_loss: 1.8828 Epoch 97/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.0667 - val_loss: 1.9086 Epoch 98/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.0442 - val_loss: 1.8893 Epoch 99/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1034 - val_loss: 1.8814 Epoch 100/100 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - loss: 1.1580 - val_loss: 1.8957
Model: "functional_40"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │ country_input │ (None, 1) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ embedding_40 │ (None, 1, 5) │ 25 │ country_input[0]… │ │ (Embedding) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ numeric_input │ (None, 25) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ flatten_40 │ (None, 5) │ 0 │ embedding_40[0][… │ │ (Flatten) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ concatenate_40 │ (None, 30) │ 0 │ numeric_input[0]… │ │ (Concatenate) │ │ │ flatten_40[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_120 (Dense) │ (None, 64) │ 1,984 │ concatenate_40[0… │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 64) │ 256 │ dense_120[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_80 │ (None, 64) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_121 (Dense) │ (None, 32) │ 2,080 │ dropout_80[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 32) │ 128 │ dense_121[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_81 │ (None, 32) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_122 (Dense) │ (None, 1) │ 33 │ dropout_81[0][0] │ 
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
Total params: 13,136 (51.32 KB)
Trainable params: 4,314 (16.85 KB)
Non-trainable params: 192 (768.00 B)
Optimizer params: 8,630 (33.71 KB)
# Baseline predictions: test-set MSE before any feature is shuffled.
baseline_preds = model0.predict([X_num_test, X_cat_test]).flatten()
baseline_mse = mean_squared_error(y_test, baseline_preds)
# BUG FIX: the first column of X_numeric is built from 'unemp_rate_lag_3'
# (see the feature-matrix construction), not the contemporaneous
# 'unemployment_rate', so the top importance bar was mislabeled.
feature_names = [
    'unemp_rate_lag_3', 'exchange_rate', 'cpi_lag_1',
    'arrivals_lag_1', 'arrivals_lag_3', 'arrivals_lag_6', 'arrivals_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted'] + month_cols
# Permutation importance: shuffle one column at a time, record the MSE increase.
importances = []
for i in range(X_num_test.shape[1]):
    X_permuted = X_num_test.copy()
    X_permuted[:, i] = np.random.permutation(X_permuted[:, i])
    permuted_preds = model0.predict([X_permuted, X_cat_test]).flatten()
    permuted_mse = mean_squared_error(y_test, permuted_preds)
    importances.append(permuted_mse - baseline_mse)
# Handle grouped month dummies so it's not one bar for each month
month_indices = [i for i, f in enumerate(feature_names) if f.startswith('month_')]
if month_indices:
    # Safe grouped permutation: move whole rows of the month-dummy columns
    # together so every permuted row keeps a valid one-hot month pattern.
    X_month_permuted = X_num_test.copy()
    row_perm = np.random.permutation(len(X_month_permuted))
    X_month_permuted[:, month_indices] = X_month_permuted[row_perm][:, month_indices]
    month_preds = model0.predict([X_month_permuted, X_cat_test]).flatten()
    month_importance = mean_squared_error(y_test, month_preds) - baseline_mse
else:
    month_importance = 0  # fallback if no month cols present
# Collapse the twelve month bars into one grouped 'month_group' bar.
filtered_names = [f for i, f in enumerate(feature_names) if i not in month_indices]
filtered_importances = [imp for i, imp in enumerate(importances) if i not in month_indices]
feature_names_final = filtered_names + ['month_group']
importances_final = filtered_importances + [month_importance]
sorted_pairs = sorted(zip(importances_final, feature_names_final), reverse=True)
importances_sorted, feature_names_sorted = zip(*sorted_pairs)
plt.figure(figsize=(12, 6))
bars = plt.barh(feature_names_sorted, importances_sorted)
plt.xlabel("Increase in MSE when shuffled")
plt.title("Model 1: Arrivals (2019)")
plt.gca().invert_yaxis()
# Add a numeric label next to each bar
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height() / 2,
             f"{width:.4f}", va='center')
plt.tight_layout()
plt.show()
8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 41ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step
## Model evaluation
# Predict on the train and test splits; Keras returns (N, 1), flatten to 1-D.
train_preds = model0.predict([X_num_train, X_cat_train]).flatten()
test_preds = model0.predict([X_num_test, X_cat_test]).flatten()
# MSE
train_mse = mean_squared_error(y_train, train_preds)
test_mse = mean_squared_error(y_test, test_preds)
# RMSE
train_rmse0 = np.sqrt(train_mse)
test_rmse0 = np.sqrt(test_mse)
# MAE
train_mae0 = mean_absolute_error(y_train, train_preds)
test_mae0 = mean_absolute_error(y_test, test_preds)
# MAPE
train_mape0 = mean_absolute_percentage_error(y_train, train_preds)
test_mape0 = mean_absolute_percentage_error(y_test, test_preds)
# R-squared
train_r2_0 = r2_score(y_train, train_preds)
test_r2_0 = r2_score(y_test, test_preds)
# Print all metrics
# NOTE: y was log1p-transformed during feature prep, so these metrics are on
# the log scale, not raw arrival counts.
print(f"Train MSE: {train_mse:.4f}, RMSE: {train_rmse0:.4f}, MAE: {train_mae0:.4f}, MAPE: {train_mape0:.4f}, R²: {train_r2_0:.4f}")
print(f"Test MSE: {test_mse:.4f}, RMSE: {test_rmse0:.4f}, MAE: {test_mae0:.4f}, MAPE: {test_mape0:.4f}, R²: {test_r2_0:.4f}")
# Plot training history (per-epoch losses recorded in the Keras History object)
plt.figure(figsize=(10, 4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training History')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()
23/30 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step30/30 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step 8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step Train MSE: 0.0673, RMSE: 0.2595, MAE: 0.2090, MAPE: 0.0223, R²: 0.9806 Test MSE: 0.4528, RMSE: 0.6729, MAE: 0.5146, MAPE: 0.0503, R²: 0.8526
# Global trend plot (total actual vs predicted per month)
y_pred_all = model0.predict([X_numeric_scaled_all, X_country_array]).flatten()
# Invert the log1p transform so totals are in raw arrival counts.
y_actual_all = np.expm1(y)
y_pred_all = np.expm1(y_pred_all)
df_plot = cool_sub.copy()
df_plot['actual'] = y_actual_all
df_plot['predicted'] = y_pred_all
monthly_totals_all = df_plot.groupby('date')[['actual', 'predicted']].sum()
plt.figure(figsize=(16, 6))
plt.plot(monthly_totals_all.index, monthly_totals_all['actual'], label='Total Actual', linewidth=2)
plt.plot(monthly_totals_all.index, monthly_totals_all['predicted'], label='Total Predicted', linestyle='--', alpha=0.8)
plt.title("Total Monthly Arrivals Subset(2024)")
plt.xlabel("Date")
# BUG FIX: this figure plots arrivals (see the title and y = arrivals_next_month),
# but the y-axis was labeled "Total Overnights".
plt.ylabel("Total Arrivals")
plt.ticklabel_format(style='plain', axis='y')
plt.tight_layout()
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# Single xticks call; the earlier rotation=90 call was redundant and immediately overridden.
plt.xticks(rotation=45)
plt.xlim(monthly_totals_all.index.min(), monthly_totals_all.index.max())
plt.axvline(x=train_end, color='gray', linestyle='--', linewidth=1.5, label='Train/Val Split')
plt.axvline(x=val_end, color='black', linestyle='--', linewidth=1.5, label='Val/Test Split')
# BUG FIX: legend() must run after the labeled axvlines — previously it was
# called before them, so the split-line entries never appeared in the legend.
plt.legend()
plt.text(train_end, plt.ylim()[1]*0.95, 'Training set', rotation=90, ha='right', va='top', color='gray')
plt.text(val_end, plt.ylim()[1]*0.95, 'Testing set', rotation=90, ha='right', va='top', color='black')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
plt.show()
45/45 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
# Country-level plots: one actual-vs-predicted panel per country, 3 columns per row.
countries = df_plot['country'].unique()
fig, axes = plt.subplots(len(countries) // 3 + 1, 3, figsize=(18, 3 * (len(countries) // 3 + 1)), sharex=False, sharey=False)
axes = axes.flatten()
for i, country in enumerate(countries):
    country_df = df_plot[df_plot['country'] == country].groupby('date')[['actual', 'predicted']].sum()
    ax = axes[i]
    ax.plot(country_df.index, country_df['actual'], label='Actual')
    ax.plot(country_df.index, country_df['predicted'], label='Predicted', linestyle='--')
    ax.set_title(country)
    # Show x-axis ticks every 3 years
    ax.xaxis.set_major_locator(mdates.YearLocator(base=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    # Add legend only once
    if i == 0:
        ax.legend()
# Remove unused subplots (the grid is sized up to the next multiple of three)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
fig.tight_layout()
fig.suptitle('Actual vs Predicted Monthly Arrivals (2024)', fontsize=16, y=1.02)
plt.show()
# df_plot: DataFrame with columns ['country', 'date', 'actual', 'predicted']
# Per-country error metrics, restricted to the test period (dates after val_end).
countries = df_plot['country'].unique()
results = []
for country in countries:
    # Subset data for this country (test set only)
    country_df = df_plot[(df_plot['country'] == country) & (df_plot['date'] > val_end)]
    actual = country_df['actual'].values
    pred = country_df['predicted'].values
    if len(actual) == 0:
        continue  # Skip countries with no data in test set
    mse = mean_squared_error(actual, pred)
    # BUG FIX: take the square root of the exact MSE. Previously RMSE was
    # computed from the value already rounded to 6 decimals, which distorts it.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, pred)
    mape = mean_absolute_percentage_error(actual, pred)
    r2 = r2_score(actual, pred)
    results.append({
        'country': country,
        'MSE': round(mse, 6),  # rounding kept for display only
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'R2': r2
    })
# Convert to DataFrame for easy export or display
country_metrics = pd.DataFrame(results)
print(country_metrics)
country MSE RMSE MAE MAPE R2 0 Germany 5.353487e+10 231376.028628 139198.397871 0.511892 0.415365 1 Italy 1.070266e+09 32714.921875 15733.568666 0.368516 0.877356 2 Poland 7.211035e+09 84917.810718 50630.250026 0.458698 0.469374 3 Slovenia 2.133528e+09 46190.123747 34501.225409 0.574672 0.827616 4 Switzerland 1.087656e+08 10429.073923 6089.707087 0.318848 0.812500
# Reload the raw dataset and redo the basic cleaning for this model run.
cool = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/df_top_cool.csv")
cool.replace('..', np.nan, inplace=True)  # '..' is the source's missing-value marker
cool['date'] = pd.to_datetime(cool[['year', 'month']].assign(day=1))
cool = cool[~cool['country'].isin(['Russian Federation'])]
# Coerce string columns to numeric; unparseable entries become NaN.
for int_col in ('year', 'month'):
    cool[int_col] = pd.to_numeric(cool[int_col], downcast='integer', errors='coerce')
for num_col in ('arrivals', 'overnights', 'unemployment_rate', 'exchange_rate', 'cpi'):
    cool[num_col] = pd.to_numeric(cool[num_col], errors='coerce')
cool['eu_member'] = cool['eu_member'].astype(int)
# Schengen accession year per country; countries absent from this map are
# treated as never having joined (effective entry year = +inf).
schengen_entry_year = {
    'Austria': 1995, 'Belgium': 1995, 'Czech Republic': 2007, 'Denmark': 2001,
    'Finland': 1996, 'France': 1995, 'Germany': 1995, 'Hungary': 2007,
    'Italy': 1997, 'Netherlands': 1995, 'Norway': 2001, 'Poland': 2007,
    'Portugal': 1995, 'Slovakia': 2007, 'Slovenia': 2007, 'Spain': 1995,
    'Sweden': 2001, 'Switzerland': 2008, 'Romania': 2024,
}
# Vectorized replacement for the original row-wise apply (same semantics,
# far faster on large frames): map each country to its entry year, treat
# unmapped countries as +inf (never a member), and compare against 'year'.
# NaN years compare False against +inf, matching the original int(...) result of 0.
_entry_years = cool['country'].map(schengen_entry_year).fillna(np.inf)
cool['schengen_member'] = (cool['year'] >= _entry_years).astype('int8')
# Merge Google Trends interest onto the panel, then log-transform the skewed
# series and build the next-month prediction target.
td = pd.read_csv("C:/Users/Korisnik/Desktop/Škola/THESIS ideas/multiTimeline.csv")
td['date'] = pd.to_datetime(td['date']).dt.to_period('M').dt.to_timestamp()
# Wide (one column per country) -> long format for the merge.
td_long = td.melt(id_vars='date', var_name='country', value_name='google_trends')
cool['date'] = pd.to_datetime(cool['date']).dt.to_period('M').dt.to_timestamp()
cool = cool.merge(td_long, on=['date', 'country'], how='left')
cool['google_trends'] = cool['google_trends'].fillna(-1)  # -1 flags "no trends data"
# Order and deduplicate the panel before any group-wise shifting.
cool = (cool.sort_values(['country', 'date'])
            .drop_duplicates(subset=['country', 'date'])
            .reset_index(drop=True))
# log1p keeps zeros finite while compressing heavy right tails.
for skewed in ('arrivals', 'overnights', 'exchange_rate', 'cpi'):
    cool[skewed] = np.log1p(cool[skewed])
# NOTE(review): the target is named "overnights_next_month" but is built from
# the (log-transformed) 'arrivals' column — confirm this is intentional.
cool['overnights_next_month'] = cool.groupby('country')['arrivals'].shift(-1)
# Calendar-month one-hot encoding plus autoregressive lag features.
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
cool['month_name'] = cool['month'].map(month_names)
ohe_month = pd.get_dummies(cool['month_name'], prefix='month').astype(int)
cool = pd.concat([cool.drop(columns=['month_name']), ohe_month], axis=1)
# NOTE(review): these "overnights_lag_*" features are computed from the
# 'arrivals' column — confirm the naming is deliberate.
for lag in (1, 3, 6, 12):
    cool[f'overnights_lag_{lag}'] = cool.groupby('country')['arrivals'].shift(lag)
cool['cpi_lag_1'] = cool.groupby('country')['cpi'].shift(1)
cool['unemp_rate_lag_3'] = cool.groupby('country')['unemployment_rate'].shift(3)
for lag in (1, 3):
    cool[f'google_trends_lag_{lag}'] = cool.groupby('country')['google_trends'].shift(lag)
# Restrict to the modeling window and the six selected source countries.
cool = cool[cool['date'] >= '2001-01-01']
selected_countries = ['Germany', 'Italy', 'Poland', 'Slovenia', 'Switzerland', 'USA']
cool_sub2 = cool[cool['country'].isin(selected_countries)].copy()
# Rows without a next-month target cannot be used for supervised learning.
cool_sub2 = cool_sub2[cool_sub2['overnights_next_month'].notna()].copy()
cool_sub2['country_encoded'] = LabelEncoder().fit_transform(cool_sub2['country'])
month_cols = [c for c in cool_sub2.columns if c.startswith('month_')]
# Numeric feature matrix: macro covariates, lags, policy dummies, month dummies.
numeric_features = [
    'unemp_rate_lag_3', 'exchange_rate', 'cpi_lag_1',
    'overnights_lag_1', 'overnights_lag_3', 'overnights_lag_6', 'overnights_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted',
] + month_cols
X_numeric = cool_sub2[numeric_features].values
X_country_array = cool_sub2['country_encoded'].astype('int32').values
y = cool_sub2['overnights_next_month'].values
# Chronological split: train <= 2016, validation 2017-2020, test > 2020.
train_end = pd.Timestamp("2016-12-31")
val_end = pd.Timestamp("2020-12-31")
train_mask = cool_sub2['date'] <= train_end
val_mask = (cool_sub2['date'] > train_end) & (cool_sub2['date'] <= val_end)
test_mask = cool_sub2['date'] > val_end
# Fit the scaler on the training period only, so validation/test statistics
# never leak into the transformation.
scaler = StandardScaler()
tr = train_mask.to_numpy()
va = val_mask.to_numpy()
te = test_mask.to_numpy()
X_num_train = scaler.fit_transform(X_numeric[tr])
X_num_val = scaler.transform(X_numeric[va])
X_num_test = scaler.transform(X_numeric[te])
X_numeric_scaled_all = scaler.transform(X_numeric)  # full history, for plotting later
X_cat_train, X_cat_val, X_cat_test = X_country_array[tr], X_country_array[va], X_country_array[te]
y_train, y_val, y_test = y[tr], y[va], y[te]
# Entity-embedding MLP: a learned 6-dim embedding per country is concatenated
# with the scaled numeric features and passed through two dense blocks.
input_numeric = Input(shape=(X_num_train.shape[1],), name="numeric_input")
input_country = Input(shape=(1,), dtype='int32', name="country_input")
embedding = Embedding(input_dim=len(np.unique(X_country_array)), output_dim=6)(input_country)
embedding_flat = Flatten()(embedding)
x = Concatenate()([input_numeric, embedding_flat])
x = Dense(64, activation='tanh')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='tanh')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
output = Dense(1)(x)  # linear head for the log-scale regression target
model0 = Model(inputs=[input_numeric, input_country], outputs=output)
model0.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
early_stop = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
history = model0.fit(
    [X_num_train, X_cat_train], y_train,
    validation_data=([X_num_val, X_cat_val], y_val),
    epochs=100, batch_size=16,
    # BUG FIX: early_stop was constructed but never passed to fit(), so early
    # stopping and best-weight restoration silently never ran.
    callbacks=[early_stop])
model0.summary()
Epoch 1/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 15s 23ms/step - loss: 96.2673 - val_loss: 83.4533 Epoch 2/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - loss: 82.6436 - val_loss: 67.1095 Epoch 3/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step - loss: 67.3150 - val_loss: 48.3680 Epoch 4/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 49.8187 - val_loss: 32.1284 Epoch 5/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 30.1898 - val_loss: 14.7346 Epoch 6/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 15.6023 - val_loss: 6.2273 Epoch 7/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 7.5336 - val_loss: 3.6870 Epoch 8/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 4.6136 - val_loss: 2.5168 Epoch 9/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 3s 37ms/step - loss: 4.1666 - val_loss: 2.2894 Epoch 10/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 3.6290 - val_loss: 2.3527 Epoch 11/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 3.4119 - val_loss: 2.3391 Epoch 12/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 4s 51ms/step - loss: 3.8713 - val_loss: 2.3436 Epoch 13/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 4s 18ms/step - loss: 3.3307 - val_loss: 2.2364 Epoch 14/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 14ms/step - loss: 2.7160 - val_loss: 2.2157 Epoch 15/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 3.0357 - val_loss: 2.1943 Epoch 16/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step - loss: 3.1373 - val_loss: 2.1400 Epoch 17/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 16ms/step - loss: 2.4436 - val_loss: 2.1681 Epoch 18/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 21ms/step - loss: 2.6754 - val_loss: 2.1685 Epoch 19/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 2.3551 - val_loss: 2.0866 Epoch 20/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 2.6772 - val_loss: 2.0352 Epoch 21/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 13ms/step - loss: 2.1725 - val_loss: 1.9905 Epoch 22/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 15ms/step - loss: 2.4136 - val_loss: 2.1027 Epoch 23/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 2.3841 - val_loss: 2.1134 
Epoch 24/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step - loss: 2.0762 - val_loss: 2.1707 Epoch 25/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 2.2028 - val_loss: 1.9971 Epoch 26/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 1.7626 - val_loss: 2.0448 Epoch 27/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - loss: 1.9955 - val_loss: 2.0780 Epoch 28/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.9399 - val_loss: 2.0611 Epoch 29/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.7777 - val_loss: 2.0006 Epoch 30/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.9445 - val_loss: 1.9586 Epoch 31/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.7657 - val_loss: 2.0208 Epoch 32/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.9710 - val_loss: 2.0420 Epoch 33/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.6780 - val_loss: 1.9459 Epoch 34/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.6586 - val_loss: 1.9306 Epoch 35/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.5943 - val_loss: 1.9734 Epoch 36/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.4743 - val_loss: 2.0016 Epoch 37/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 24ms/step - loss: 1.7330 - val_loss: 1.9821 Epoch 38/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 3s 20ms/step - loss: 1.6734 - val_loss: 1.9506 Epoch 39/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 1.6656 - val_loss: 1.8936 Epoch 40/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - loss: 1.4171 - val_loss: 1.9393 Epoch 41/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.3501 - val_loss: 1.9552 Epoch 42/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.5745 - val_loss: 2.0136 Epoch 43/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.6027 - val_loss: 1.9249 Epoch 44/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.4072 - val_loss: 1.9168 Epoch 45/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2585 - val_loss: 1.9042 Epoch 46/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step - loss: 1.3201 - val_loss: 1.9343 Epoch 
47/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 15ms/step - loss: 1.3753 - val_loss: 1.9005 Epoch 48/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.4154 - val_loss: 1.8944 Epoch 49/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 1.2585 - val_loss: 1.9297 Epoch 50/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 1.1731 - val_loss: 1.9718 Epoch 51/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.3242 - val_loss: 1.9290 Epoch 52/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.2209 - val_loss: 1.9211 Epoch 53/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.2861 - val_loss: 1.9466 Epoch 54/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.2940 - val_loss: 1.8893 Epoch 55/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.2450 - val_loss: 1.9128 Epoch 56/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2355 - val_loss: 1.9380 Epoch 57/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.1741 - val_loss: 1.9609 Epoch 58/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.2285 - val_loss: 1.9153 Epoch 59/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.1282 - val_loss: 1.9195 Epoch 60/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.1603 - val_loss: 1.9625 Epoch 61/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.1549 - val_loss: 1.8665 Epoch 62/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.1905 - val_loss: 1.9088 Epoch 63/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 1.0087 - val_loss: 1.9052 Epoch 64/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 21ms/step - loss: 0.9954 - val_loss: 1.9662 Epoch 65/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - loss: 1.1956 - val_loss: 1.9599 Epoch 66/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - loss: 1.1244 - val_loss: 1.8904 Epoch 67/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 1.1630 - val_loss: 1.9382 Epoch 68/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.1095 - val_loss: 1.9623 Epoch 69/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.0993 - val_loss: 1.9723 Epoch 70/100 
72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 1.0702 - val_loss: 1.9438 Epoch 71/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.1175 - val_loss: 1.9137 Epoch 72/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.0992 - val_loss: 1.9411 Epoch 73/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.0389 - val_loss: 2.0290 Epoch 74/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.9781 - val_loss: 1.9440 Epoch 75/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - loss: 0.9713 - val_loss: 1.9440 Epoch 76/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - loss: 1.1828 - val_loss: 1.9852 Epoch 77/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 19ms/step - loss: 1.0950 - val_loss: 1.9141 Epoch 78/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 3s 29ms/step - loss: 1.0716 - val_loss: 1.9159 Epoch 79/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - loss: 1.0330 - val_loss: 1.9094 Epoch 80/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.1201 - val_loss: 1.8987 Epoch 81/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.0059 - val_loss: 1.9590 Epoch 82/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.1762 - val_loss: 1.9424 Epoch 83/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 2s 15ms/step - loss: 1.0011 - val_loss: 1.9317 Epoch 84/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.1152 - val_loss: 1.9488 Epoch 85/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.0686 - val_loss: 2.0011 Epoch 86/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.0379 - val_loss: 1.9176 Epoch 87/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.0014 - val_loss: 1.9522 Epoch 88/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.9960 - val_loss: 1.9251 Epoch 89/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.9819 - val_loss: 1.9313 Epoch 90/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.0863 - val_loss: 1.9259 Epoch 91/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.0982 - val_loss: 1.9510 Epoch 92/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 1.1539 - val_loss: 1.9678 Epoch 93/100 72/72 
━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.9057 - val_loss: 1.9402 Epoch 94/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.9511 - val_loss: 1.9875 Epoch 95/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.9574 - val_loss: 1.9802 Epoch 96/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 1.1070 - val_loss: 1.9393 Epoch 97/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - loss: 1.0748 - val_loss: 1.9470 Epoch 98/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 1.1290 - val_loss: 1.9775 Epoch 99/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 1.1856 - val_loss: 1.9335 Epoch 100/100 72/72 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.9632 - val_loss: 1.9449
Model: "functional_42"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │ country_input │ (None, 1) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ embedding_42 │ (None, 1, 6) │ 36 │ country_input[0]… │ │ (Embedding) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ numeric_input │ (None, 25) │ 0 │ - │ │ (InputLayer) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ flatten_42 │ (None, 6) │ 0 │ embedding_42[0][… │ │ (Flatten) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ concatenate_42 │ (None, 31) │ 0 │ numeric_input[0]… │ │ (Concatenate) │ │ │ flatten_42[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_126 (Dense) │ (None, 64) │ 2,048 │ concatenate_42[0… │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 64) │ 256 │ dense_126[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_84 │ (None, 64) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_127 (Dense) │ (None, 32) │ 2,080 │ dropout_84[0][0] │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ batch_normalizatio… │ (None, 32) │ 128 │ dense_127[0][0] │ │ (BatchNormalizatio… │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dropout_85 │ (None, 32) │ 0 │ batch_normalizat… │ │ (Dropout) │ │ │ │ ├─────────────────────┼───────────────────┼────────────┼───────────────────┤ │ dense_128 (Dense) │ (None, 1) │ 33 │ dropout_85[0][0] │ 
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
Total params: 13,361 (52.20 KB)
Trainable params: 4,389 (17.14 KB)
Non-trainable params: 192 (768.00 B)
Optimizer params: 8,780 (34.30 KB)
# Permutation importance: shuffle one scaled feature at a time and measure the
# increase in test-set MSE over the unpermuted baseline.
baseline_preds = model0.predict([X_num_test, X_cat_test]).flatten()
baseline_mse = mean_squared_error(y_test, baseline_preds)
# BUG FIX: the first entry was labeled 'unemployment_rate', but column 0 of
# X_numeric is the lagged feature 'unemp_rate_lag_3' — the importance bar for
# it was mislabeled. Names now match the feature-matrix column order exactly.
feature_names = [
    'unemp_rate_lag_3', 'exchange_rate', 'cpi_lag_1',
    'overnights_lag_1', 'overnights_lag_3', 'overnights_lag_6', 'overnights_lag_12',
    'google_trends_lag_1', 'google_trends_lag_3', 'schengen_member', 'euro_adopted'] + month_cols
importances = []
for i in range(X_num_test.shape[1]):
    X_permuted = X_num_test.copy()
    X_permuted[:, i] = np.random.permutation(X_permuted[:, i])
    permuted_preds = model0.predict([X_permuted, X_cat_test]).flatten()
    permuted_mse = mean_squared_error(y_test, permuted_preds)
    # Positive value = shuffling this feature hurt the model.
    importances.append(permuted_mse - baseline_mse)
# Collapse the twelve month dummies into one grouped importance so the chart
# shows a single "month" bar rather than twelve.
month_indices = [i for i, f in enumerate(feature_names) if f.startswith('month_')]
if month_indices:
    # Permute whole rows of the month-dummy sub-matrix together, preserving
    # each row's internal one-hot pattern.
    X_month_permuted = X_num_test.copy()
    row_perm = np.random.permutation(len(X_month_permuted))
    X_month_permuted[:, month_indices] = X_month_permuted[row_perm][:, month_indices]
    month_preds = model0.predict([X_month_permuted, X_cat_test]).flatten()
    month_importance = mean_squared_error(y_test, month_preds) - baseline_mse
else:
    month_importance = 0  # fallback if no month dummies are present
keep = [i for i in range(len(feature_names)) if i not in month_indices]
feature_names_final = [feature_names[i] for i in keep] + ['month_group']
importances_final = [importances[i] for i in keep] + [month_importance]
sorted_pairs = sorted(zip(importances_final, feature_names_final), reverse=True)
importances_sorted, feature_names_sorted = zip(*sorted_pairs)
# Horizontal bar chart of the permutation importances, largest at the top.
plt.figure(figsize=(12, 6))
bars = plt.barh(feature_names_sorted, importances_sorted)
plt.xlabel("Increase in MSE when shuffled")
plt.title("Model 1: Overnights, subset of high-volume countries (2024)")
plt.gca().invert_yaxis()  # barh draws bottom-up; flip so ranking reads top-down
# Annotate each bar with its numeric value.
for bar in bars:
    w = bar.get_width()
    plt.text(w + 0.001, bar.get_y() + bar.get_height() / 2, f"{w:.4f}", va='center')
plt.tight_layout()
plt.show()
9/9 ━━━━━━━━━━━━━━━━━━━━ 1s 57ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 42ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 35ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 28ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 1s 60ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
## Model evaluation
# Score the fitted network on the train and test splits (log-scale target).
train_preds = model0.predict([X_num_train, X_cat_train]).flatten()
test_preds = model0.predict([X_num_test, X_cat_test]).flatten()
# Squared-error family
train_mse = mean_squared_error(y_train, train_preds)
test_mse = mean_squared_error(y_test, test_preds)
train_rmse0, test_rmse0 = np.sqrt(train_mse), np.sqrt(test_mse)
# Absolute-error family
train_mae0 = mean_absolute_error(y_train, train_preds)
test_mae0 = mean_absolute_error(y_test, test_preds)
train_mape0 = mean_absolute_percentage_error(y_train, train_preds)
test_mape0 = mean_absolute_percentage_error(y_test, test_preds)
# Goodness of fit
train_r2_0 = r2_score(y_train, train_preds)
test_r2_0 = r2_score(y_test, test_preds)
print(f"Train MSE: {train_mse:.4f}, RMSE: {train_rmse0:.4f}, MAE: {train_mae0:.4f}, MAPE: {train_mape0:.4f}, R²: {train_r2_0:.4f}")
print(f"Test MSE: {test_mse:.4f}, RMSE: {test_rmse0:.4f}, MAE: {test_mae0:.4f}, MAPE: {test_mape0:.4f}, R²: {test_r2_0:.4f}")
# Learning curves: per-epoch training vs. validation MSE.
plt.figure(figsize=(10, 4))
for series, series_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[series], label=series_label)
plt.title('Training History')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()
36/36 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 30ms/step Train MSE: 0.0881, RMSE: 0.2968, MAE: 0.2366, MAPE: 0.0257, R²: 0.9726 Test MSE: 0.4140, RMSE: 0.6434, MAE: 0.4859, MAPE: 0.0480, R²: 0.8591
# Back-transform predictions from log scale and plot total actual vs. predicted
# arrivals per month across the whole sample, with split boundaries marked.
y_pred_all = model0.predict([X_numeric_scaled_all, X_country_array]).flatten()
y_actual_all = np.expm1(y)           # undo the log1p applied during preprocessing
y_pred_all = np.expm1(y_pred_all)
df_plot = cool_sub2.copy()
df_plot['actual'] = y_actual_all
df_plot['predicted'] = y_pred_all
monthly_totals_all = df_plot.groupby('date')[['actual', 'predicted']].sum()
plt.figure(figsize=(16, 6))
plt.plot(monthly_totals_all.index, monthly_totals_all['actual'], label='Total Actual', linewidth=2)
plt.plot(monthly_totals_all.index, monthly_totals_all['predicted'], label='Total Predicted', linestyle='--', alpha=0.8)
plt.title("Total Monthly Overnights Subset(2024)")
plt.xlabel("Date")
plt.ylabel("Total Overnights")
plt.ticklabel_format(style='plain', axis='y')
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.xlim(monthly_totals_all.index.min(), monthly_totals_all.index.max())
# Chronological split markers.
plt.axvline(x=train_end, color='gray', linestyle='--', linewidth=1.5, label='Train/Val Split')
plt.axvline(x=val_end, color='black', linestyle='--', linewidth=1.5, label='Val/Test Split')
plt.text(train_end, plt.ylim()[1]*0.95, 'Training set', rotation=90, ha='right', va='top', color='gray')
plt.text(val_end, plt.ylim()[1]*0.95, 'Testing set', rotation=90, ha='right', va='top', color='black')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
# BUG FIX: legend() was previously called before the labeled axvline() calls,
# so the split-line entries never appeared; build the legend after all
# labeled artists exist.
plt.legend()
plt.tight_layout()
plt.show()
54/54 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
# Per-country panels of actual vs. predicted monthly totals.
countries = df_plot['country'].unique()
rows = len(countries) // 3 + 1
fig, axes = plt.subplots(rows, 3, figsize=(18, 3 * rows), sharex=False, sharey=False)
axes = axes.flatten()
for i, country in enumerate(countries):
    ax = axes[i]
    totals = df_plot[df_plot['country'] == country].groupby('date')[['actual', 'predicted']].sum()
    ax.plot(totals.index, totals['actual'], label='Actual')
    ax.plot(totals.index, totals['predicted'], label='Predicted', linestyle='--')
    ax.set_title(country)
    # One x tick every three years keeps the small panels legible.
    ax.xaxis.set_major_locator(mdates.YearLocator(base=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    if i == 0:
        ax.legend()  # a single legend for the whole figure
# Delete leftover empty axes from the grid.
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
fig.tight_layout()
fig.suptitle('Actual vs Predicted Monthly Arrivals (2024)', fontsize=16, y=1.02)
plt.show()
# Test-period metrics per country (df_plot rows dated after val_end).
countries = df_plot['country'].unique()
results = []
for country in countries:
    in_test = (df_plot['country'] == country) & (df_plot['date'] > val_end)
    country_df = df_plot[in_test]
    actual = country_df['actual'].values
    pred = country_df['predicted'].values
    if len(actual) == 0:
        continue  # no test-period rows for this country
    mse = round(mean_squared_error(actual, pred), 6)
    metrics = {'country': country, 'MSE': mse, 'RMSE': np.sqrt(mse)}
    metrics['MAE'] = mean_absolute_error(actual, pred)
    metrics['MAPE'] = mean_absolute_percentage_error(actual, pred)
    metrics['R2'] = r2_score(actual, pred)
    results.append(metrics)
# Tabulate for display or export.
country_metrics = pd.DataFrame(results)
print(country_metrics)
country MSE RMSE MAE MAPE R2 0 Germany 4.024194e+10 200603.944104 121130.643923 0.450907 0.560532 1 Italy 7.848145e+08 28014.540743 16760.034107 0.388984 0.910066 2 Poland 5.813232e+09 76244.554155 43257.363849 0.408261 0.572232 3 Slovenia 3.027006e+09 55018.231569 37428.719706 0.524595 0.755425 4 Switzerland 1.977036e+08 14060.710910 7427.436933 0.330616 0.659180 5 USA 8.089627e+08 28442.269892 19675.270503 0.460863 0.580131